# wcag_AI_validation/dependences/language_extractor.py
#
# 211 lines
# 8.4 KiB
# Python

import asyncio
from playwright.async_api import async_playwright
from datetime import datetime, timezone
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Optional
import json
import argparse
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
import requests
import os
import urllib.parse
from pathlib import Path
class LanguageExtractor:
    """Extract language declarations from a web page loaded with Playwright.

    Two independent extractions are offered:

    * ``extract_languages`` lists the elements that declare a ``lang``
      attribute, split by whether they also declare ``xml:lang``.
    * ``extract_content_with_lang_context`` lists the text of leaf elements
      together with the language declared for each of them, which is the
      material needed to evaluate WCAG technique H58 (language changes in
      the content must be marked with ``lang``).

    Each call launches its own headless Chromium instance and closes it
    before returning.
    """

    # Navigation timeout handed to ``page.goto`` (Playwright expects ms).
    NAVIGATION_TIMEOUT_MS = 50000
    # Upper bound on the number of text characters collected per page, so the
    # result stays small enough to be passed on to a language model.
    MAX_CONTENT_CHARS = 15000
    # Elements whose text is never page content.
    SKIPPED_TAGS = frozenset({"script", "style", "noscript", "html"})
    # Entry appended to ``extracted_segments`` when the budget is exhausted.
    TRUNCATION_MARKER = "...[Truncated: Limit Reached]"

    # Runs in the browser: gathers everything needed about one element in a
    # single round trip. ``closest`` starts at the element itself, so
    # ``effective_lang`` is the element's own ``lang`` when it has one and the
    # nearest ancestor's otherwise.
    _ELEMENT_INFO_JS = """el => {
        const owner = el.closest('[lang]');
        return {
            tag: el.tagName.toLowerCase(),
            lang: el.getAttribute('lang'),
            effective_lang: owner ? owner.getAttribute('lang') : null,
        };
    }"""

    def __init__(
        self,
        url: str,
    ):
        """Store the address of the page to analyse.

        Args:
            url: Address passed unchanged to ``page.goto``.
        """
        self.url = url

    @staticmethod
    def _clip_to_budget(text: str, remaining: int) -> tuple:
        """Return ``(text, was_clipped)`` with *text* cut to *remaining* chars."""
        if len(text) <= remaining:
            return text, False
        return text[:remaining], True

    async def extract_languages(self, extract_context=True) -> Dict:
        """List the elements of the page that carry a ``lang`` attribute.

        The ``<html>`` element is reported as a synthetic empty tag holding
        only its language attributes; every other element is reported with
        its full ``outerHTML``. An ``<html>`` element that has ``xml:lang``
        but no ``lang`` is not reported.

        Args:
            extract_context: Not used by this method; kept so that existing
                callers passing it keep working.

        Returns:
            ``{"lang_only": str, "lang_and_xml": str}``, each value being the
            matching HTML snippets joined with ``"; "`` (empty string when
            there are none), or ``{"error": str}`` when navigation or
            extraction fails.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                # Only the DOM is needed, so do not wait for images,
                # stylesheets and other sub-resources to finish loading.
                await page.goto(
                    self.url,
                    timeout=self.NAVIGATION_TIMEOUT_MS,
                    wait_until="domcontentloaded",
                )

                lang_only_elements = []
                lang_and_xml_lang_elements = []

                # Root element: report just the attributes, not the whole
                # document that its outerHTML would contain.
                html_tag = page.locator('html')
                html_tag_lang = await html_tag.get_attribute('lang')
                html_tag_xml_lang = await html_tag.get_attribute('xml:lang')
                if html_tag_lang and html_tag_xml_lang:
                    lang_and_xml_lang_elements.append(
                        f'<html lang="{html_tag_lang}" xml:lang="{html_tag_xml_lang}"></html>'
                    )
                elif html_tag_lang:
                    lang_only_elements.append(f'<html lang="{html_tag_lang}"></html>')

                # Every other element that declares a language.
                elements_with_lang = await page.locator(
                    '//*[@lang and not(self::html)]'
                ).all()
                for element in elements_with_lang:
                    outer_html = await element.evaluate('el => el.outerHTML')
                    xml_lang = await element.get_attribute('xml:lang')
                    if xml_lang:
                        lang_and_xml_lang_elements.append(outer_html)
                    else:
                        lang_only_elements.append(outer_html)

                return {
                    "lang_only": "; ".join(lang_only_elements),
                    "lang_and_xml": "; ".join(lang_and_xml_lang_elements)
                }
            except Exception as e:
                # Boundary of the extraction: report the failure to the
                # caller as data instead of raising.
                print(f"Error extracting languages: {e}")
                return {"error": str(e)}
            finally:
                await browser.close()

    async def extract_content_with_lang_context(self) -> Dict:
        """Collect leaf-element text together with its language declaration.

        Supports the H58 verification: read the text of the page, spot
        passages written in a language other than the page default, and check
        whether those passages are marked with a ``lang`` attribute. A
        language change without a ``lang`` attribute is an H58 failure.

        Only elements that have text and no child elements are visited.
        NOTE: text placed directly next to inline children (mixed content
        such as ``<p>Hello <em>x</em> world</p>``) is therefore not collected
        for the parent element.

        Collection stops once ``MAX_CONTENT_CHARS`` characters have been
        gathered; a segment that would exceed the budget is cut to fit, and
        ``TRUNCATION_MARKER`` is appended as the last entry.

        Returns:
            On success a dict with:

            * ``main_page_lang``: ``lang`` of ``<html>``, or ``"unknown"``.
            * ``extracted_segments``: list of dicts with the keys ``tag``
              (lower-case tag name), ``lang`` (the element's own ``lang`` or
              ``"inherited"``), ``effective_lang`` (own or nearest ancestor
              ``lang``, falling back to ``main_page_lang``) and ``html``
              (the stripped inner text; the key name is historical),
              optionally followed by the truncation marker string.
            * ``total_char_count``: number of text characters collected.

            On failure ``{"error": str}``.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            results = {
                "main_page_lang": "not specified",
                "extracted_segments": [],
                "total_char_count": 0
            }

            try:
                # The DOM is all that is needed; skip waiting for resources.
                await page.goto(
                    self.url,
                    timeout=self.NAVIGATION_TIMEOUT_MS,
                    wait_until="domcontentloaded",
                )

                # 1. Page default language.
                html_tag = page.locator('html')
                root_lang = await html_tag.get_attribute('lang') or "unknown"
                results["main_page_lang"] = root_lang

                # 2. Leaf elements that contain text.
                elements = await page.locator('//*[text() and not(*)]').all()

                segments = results["extracted_segments"]
                current_length = 0

                for element in elements:
                    remaining = self.MAX_CONTENT_CHARS - current_length
                    if remaining <= 0:
                        segments.append(self.TRUNCATION_MARKER)
                        break

                    try:
                        info = await element.evaluate(self._ELEMENT_INFO_JS)
                        tag_name = info["tag"]
                        if tag_name in self.SKIPPED_TAGS:
                            continue

                        clean_text = (await element.inner_text()).strip()
                        if not clean_text:
                            continue
                    except Exception:
                        # Best effort by design: an element that cannot be
                        # read (detached, not an HTML element, ...) is
                        # skipped so the rest of the page is still analysed.
                        continue

                    # Enforce the budget on the segment itself; otherwise one
                    # large element could exceed the limit by any amount.
                    clean_text, was_clipped = self._clip_to_budget(
                        clean_text, remaining
                    )

                    local_lang = info["lang"]
                    segments.append({
                        "tag": tag_name,
                        "lang": local_lang if local_lang else "inherited",
                        # Tells the consumer *which* language is inherited,
                        # so text inside e.g. <blockquote lang="fr"> is not
                        # mistaken for page-language text.
                        "effective_lang": info["effective_lang"] or root_lang,
                        "html": clean_text
                    })
                    current_length += len(clean_text)

                    if was_clipped:
                        segments.append(self.TRUNCATION_MARKER)
                        break

                results["total_char_count"] = current_length
                return results

            except Exception as e:
                return {"error": str(e)}
            finally:
                await browser.close()