tecnica g88
This commit is contained in:
parent
aea0d9fef8
commit
f81c5aad2b
|
|
@ -0,0 +1,211 @@
|
|||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from typing import List, Dict, Optional
|
||||
import json
|
||||
import argparse
|
||||
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
|
||||
import requests
|
||||
import os
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class LanguageExtractor:
    """Extract lang/xml:lang attribute information from a web page.

    Uses Playwright (headless Chromium) to load the page and inspect the DOM
    for language markup, as needed for WCAG H58 evaluation.
    """

    def __init__(
        self,
        url: str,
    ):
        # URL of the page to analyse; the browser is launched lazily inside
        # the extraction coroutines, not here.
        self.url = url

    async def extract_languages(self, extract_context=True) -> Dict:
        """Collect a representation of every element carrying a lang attribute.

        Returns:
            Dict with keys:
              - "lang_only":    "; "-joined elements having lang but no xml:lang
              - "lang_and_xml": "; "-joined elements having both attributes
            or {"error": str} when navigation/extraction fails.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                # "domcontentloaded" is faster than "load": we only need the
                # DOM, not all page resources.
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")

                lang_only_elements = []
                lang_and_xml_lang_elements = []

                # The <html> root is handled separately so it can be reported
                # as a synthetic, minimal tag instead of its full outer HTML.
                html_tag = page.locator('html')
                html_tag_lang = await html_tag.get_attribute('lang')
                html_tag_xml_lang = await html_tag.get_attribute('xml:lang')

                if html_tag_lang and html_tag_xml_lang:
                    lang_and_xml_lang_elements.append(
                        f'<html lang="{html_tag_lang}" xml:lang="{html_tag_xml_lang}"></html>'
                    )
                elif html_tag_lang:
                    lang_only_elements.append(f'<html lang="{html_tag_lang}"></html>')

                # All other elements carrying a lang attribute (excluding <html>).
                elements_with_lang = await page.locator('//*[@lang and not(self::html)]').all()

                for element in elements_with_lang:
                    outer_html = await element.evaluate('el => el.outerHTML')
                    xml_lang = await element.get_attribute('xml:lang')
                    if xml_lang:
                        lang_and_xml_lang_elements.append(outer_html)
                    else:
                        lang_only_elements.append(outer_html)

                return {
                    "lang_only": "; ".join(lang_only_elements),
                    "lang_and_xml": "; ".join(lang_and_xml_lang_elements)
                }

            except Exception as e:
                print(f"Error extracting languages: {e}")
                return {"error": str(e)}

            finally:
                await browser.close()

    async def extract_content_with_lang_context(self) -> Dict:
        """
        The verification is:
        Read through all the text content on the page and identify any passages that are in a different language than the page default
        Then check whether those passages have a lang attribute marking them correctly as being in a different language.
        If a language change exists in the text but no lang attribute is present → that's a failure of H58"""

        async with async_playwright() as p:
            # Efficiently launch and manage the browser lifecycle
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()

            results = {
                "main_page_lang": "not specified",
                "extracted_segments": [],
                "total_char_count": 0
            }

            try:
                # Optimized wait: stop once the DOM is ready
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")

                # 1. Root language (global context for the whole page).
                html_tag = page.locator('html')
                root_lang = await html_tag.get_attribute('lang') or "unknown"
                results["main_page_lang"] = root_lang

                # 2. Leaf nodes containing text (the H58 logic): elements with
                # text but no child elements give the 'cleanest' snippets.
                elements = await page.locator('//*[text() and not(*)]').all()

                current_length = 0
                max_length = 15000  # cap total text to bound downstream LLM token usage

                for element in elements:
                    if current_length >= max_length:
                        results["extracted_segments"].append("...[Truncated: Limit Reached]")
                        break

                    try:
                        # Skip non-content tags
                        tag_name = await element.evaluate("el => el.tagName.toLowerCase()")
                        if tag_name in ['script', 'style', 'noscript', 'html']:
                            continue

                        # Local language context for this element, if any.
                        local_lang = await element.get_attribute('lang')
                        clean_text = (await element.inner_text()).strip()
                        if not clean_text:
                            continue

                        # Package the data: text + its specific language metadata.
                        segment = {
                            "tag": tag_name,
                            "lang": local_lang if local_lang else "inherited",
                            "html": clean_text
                        }

                        results["extracted_segments"].append(segment)
                        current_length += len(clean_text)

                    except Exception:
                        # Silently skip individual element errors to keep the loop moving
                        continue

                results["total_char_count"] = current_length
                return results

            except Exception as e:
                return {"error": str(e)}
            finally:
                await browser.close()
|
||||
|
|
@ -2,15 +2,21 @@ from dependences.utils import call_API_urlibrequest, encode_image_from_url
|
|||
import json
|
||||
import re
|
||||
|
||||
|
||||
class MLLMManager:
|
||||
def __init__(self, end_point, api_key, model_id):
|
||||
self.end_point = end_point
|
||||
self.api_key = api_key
|
||||
self.model_id = model_id
|
||||
|
||||
def get_response(self, system_prompt, user_prompt, openai_model=False):
|
||||
def get_response(
|
||||
self, system_prompt, user_prompt, openai_model=False, is_only_textual=False
|
||||
):
|
||||
payload = self.create_mllm_payload(
|
||||
system_prompt, user_prompt, openai_model=openai_model
|
||||
system_prompt,
|
||||
user_prompt,
|
||||
openai_model=openai_model,
|
||||
is_only_textual=is_only_textual,
|
||||
)
|
||||
# print("LLM full payload:", payload)
|
||||
headers = [
|
||||
|
|
@ -37,6 +43,7 @@ class MLLMManager:
|
|||
system_prompt,
|
||||
user_prompt,
|
||||
openai_model=False,
|
||||
is_only_textual=False,
|
||||
):
|
||||
if openai_model:
|
||||
print("Creating OpenAI format payload")
|
||||
|
|
@ -54,29 +61,49 @@ class MLLMManager:
|
|||
}
|
||||
else: # ollama format
|
||||
print("Creating alternative LLM format payload")
|
||||
payload = {
|
||||
"model": self.model_id,
|
||||
"stream": False,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{
|
||||
"role": "user",
|
||||
"content": user_prompt["user_prompt"],
|
||||
"images": [user_prompt["image_base64"]],
|
||||
if is_only_textual:
|
||||
payload = {
|
||||
"model": self.model_id,
|
||||
"stream": False,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt["user_prompt"]},
|
||||
],
|
||||
"options": {
|
||||
# "seed": 123,
|
||||
"temperature": 0.7,
|
||||
"num_ctx": 8192, # max input token
|
||||
"num_predict": 800, # max output tokens
|
||||
"top_p": 0.95,
|
||||
},
|
||||
],
|
||||
"options": {
|
||||
#"seed": 123,
|
||||
"temperature": 0.7,
|
||||
"num_ctx": 8192, # max input token
|
||||
"num_predict": 800, # max output tokens
|
||||
"top_p": 0.95,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
else:
|
||||
payload = {
|
||||
"model": self.model_id,
|
||||
"stream": False,
|
||||
"messages": [
|
||||
{"role": "system", "content": system_prompt},
|
||||
{
|
||||
"role": "user",
|
||||
"content": user_prompt["user_prompt"],
|
||||
"images": [user_prompt["image_base64"]],
|
||||
},
|
||||
],
|
||||
"options": {
|
||||
# "seed": 123,
|
||||
"temperature": 0.7,
|
||||
"num_ctx": 8192, # max input token
|
||||
"num_predict": 800, # max output tokens
|
||||
"top_p": 0.95,
|
||||
},
|
||||
}
|
||||
return payload
|
||||
|
||||
# --------alt text evaluation specific methods ---------
|
||||
|
||||
def get_alt_text_system_prompt(self):
|
||||
|
||||
|
||||
# https://www.w3.org/WAI/WCAG22/Techniques/general/G94 without examples
|
||||
system_prompt = """You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for
|
||||
images on webpages are appropriate according to WCAG guidelines. The alt-text should serve the same purpose and present
|
||||
|
|
@ -124,6 +151,40 @@ class MLLMManager:
|
|||
|
||||
return system_prompt
|
||||
|
||||
def get_g88_system_prompt(self):
    """Return the system prompt for the WCAG G88 (descriptive page title) evaluation.

    Based on https://www.w3.org/WAI/WCAG22/Techniques/general/G88, examples omitted.
    The prompt instructs the model to answer in a fixed JSON shape with
    "Assessment" (1-5), "Judgment" (success/failure/warning) and
    "EvaluationResult" (short reasoning).
    """

    # Fix: "assessmnet" typo corrected to "assessment" in instruction 4.
    system_prompt = """You are a web accessibility evaluation tool.
Your task is to determine if web pages have a descriptive title, according to WCAG guidelines.
The objective of this technique is to give each web page a descriptive title. Descriptive titles help users find content, orient themselves within it, and navigate through it. A descriptive title allows a user to easily identify what web page they are using and to tell when the web page has changed. The title can be used to identify the web page without requiring users to read or interpret page content. Users can more quickly identify the content they need when accurate, descriptive titles appear in site maps or lists of search results. When descriptive titles are used within link text, they help users navigate more precisely to the content they are interested in.
The title of each web page should:
- Identify the subject of the web page
- Make sense when read out of context, for example by a screen reader or in a site map or list of search results
- Be short

Follow these instructions carefully:
1. You will be provided with the following:
-The <title> content (if present, and if absent, acknowledge this in your evaluation).
-The main section and headings of the page as context.

2. Determine if the page title is descriptive, by comparing its semantic meaning with the partial context provided.

3. Provide a judgment based on the following:
- 'success' If you can determine with sufficient certainty that the page title is meaningful for the purpose and content of the page,
- 'failure' If you can determine with sufficient certainty that it is not meaningful,
- 'warning' if you cannot determine with 'sufficient certainty'.
where the level of certainty goes from 1 to 100 and 'sufficient certainty' means > 80

4. Provide the assessment on a scale from 1 to 5, where 5 is the best score. Use an integer number only. Note: assessment and judgment should be consistent but their purpose is different.

5. Provide a brief reasoning for your judgment. Your response should be in English. Keep your response within 100 words.

6. Here is the JSON format the result must have:
{"Assessment" : "*your assessment*", "Judgment" : "*your judgment*", "EvaluationResult": "*your response*"}"""

    return system_prompt
|
||||
|
||||
def get_alt_text_user_prompt( # the user_prompt is specific to the platform used (openai, ollama)
|
||||
self, altTextMessage, imageURL, HTMLcontext, pageText, openai_model=True
|
||||
):
|
||||
|
|
@ -143,6 +204,23 @@ class MLLMManager:
|
|||
|
||||
return user_prompt
|
||||
|
||||
def get_standard_textual_user_prompt(  # the user_prompt is specific to the platform used (openai, ollama)
    self, texts, openai_model=True
):
    """Build a text-only user prompt in the format the target platform expects.

    Args:
        texts: iterable of text fragments to include in the prompt.
        openai_model: when True, return OpenAI-style content parts;
            otherwise return the single-string ollama-style payload.
    """
    if openai_model:
        # OpenAI chat format: one {"type": "text"} part per fragment, in order.
        return [{"type": "text", "text": fragment} for fragment in texts]

    # Ollama-style format: fragments joined into one string. Each fragment is
    # prefixed with a space (so the string starts with a leading space, as the
    # original implementation produced).
    joined = ""
    for fragment in texts:
        joined = joined + " " + fragment
    return {"user_prompt": joined}
|
||||
|
||||
def make_alt_text_evaluation(
|
||||
self,
|
||||
images,
|
||||
|
|
@ -151,7 +229,7 @@ class MLLMManager:
|
|||
print("Using end_point:", self.end_point)
|
||||
|
||||
alt_text_system_prompt = self.get_alt_text_system_prompt()
|
||||
#print("alt_text_system_prompt:", alt_text_system_prompt)
|
||||
# print("alt_text_system_prompt:", alt_text_system_prompt)
|
||||
|
||||
mllm_responses = []
|
||||
for img_info in images:
|
||||
|
|
@ -204,11 +282,60 @@ class MLLMManager:
|
|||
mllm_responses.append(report)
|
||||
return mllm_responses
|
||||
|
||||
# --- end of alt text evaluation specific methods ---------
|
||||
|
||||
def make_h58_evaluation(
    self,
    main_language,
    other_textual_elements,
    openai_model=False,
):
    """Run the (placeholder) H58 evaluation and return a list of reports.

    NOTE(review): this is currently a stub — it logs its inputs and returns a
    single empty report, without calling the MLLM; presumably the real call
    will be added later. The returned shape matches what callers consume.
    """
    print("Using end_point:", self.end_point)
    print(
        "make_h58_evaluation - main_language:",
        main_language,
        "other_textual_elements:",
        other_textual_elements,
    )
    empty_report = {
        "mllm_response": "",
    }
    return [empty_report]
|
||||
|
||||
def make_g88_evaluation(
    self,
    title_content,
    openai_model=False,
):
    """Ask the MLLM whether the page title is descriptive (WCAG G88).

    Args:
        title_content: dict with at least "title" and "structural_content"
            keys (as produced by PageTitleExtractor.extract_page_title).
        openai_model: forwarded to prompt building and get_response to pick
            the payload format.

    Returns:
        dict: {"mllm_response": <raw model response>}.
    """

    system_prompt = self.get_g88_system_prompt()

    # Fix: str() must wrap only the title, not the concatenation — otherwise a
    # non-string title raises TypeError before str() can coerce it.
    page_title = "The title of the page is: " + str(title_content["title"]) + ". "
    structural_content = (
        "Here is the content of the page (<main> tag, headings):"
        + str(title_content["structural_content"])
    )
    user_prompt = self.get_standard_textual_user_prompt(
        texts=[page_title, structural_content], openai_model=openai_model
    )

    # Text-only request: no image payload is attached for G88.
    mllm_response = self.get_response(
        system_prompt=system_prompt,
        user_prompt=user_prompt,
        openai_model=openai_model,
        is_only_textual=True,
    )

    report = {
        "mllm_response": mllm_response,
    }

    return report
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
### Other utility functions
|
||||
def parse_mllm_alt_text_response(mllm_response):
|
||||
"""
|
||||
Parse an MLLM response string and extract key attributes into a JSON object.
|
||||
|
|
@ -216,10 +343,10 @@ def parse_mllm_alt_text_response(mllm_response):
|
|||
from mllm response like:
|
||||
```json\n{\n\"Original alt-text assessment\"... etc
|
||||
to a structured dictionary.
|
||||
|
||||
|
||||
Args:
|
||||
mllm_response (str): The raw MLLM response text containing JSON data
|
||||
|
||||
|
||||
Returns:
|
||||
dict: A dictionary containing the extracted attributes, or None if parsing fails
|
||||
"""
|
||||
|
|
@ -230,46 +357,50 @@ def parse_mllm_alt_text_response(mllm_response):
|
|||
"original_alt_text_assessment": None,
|
||||
"assessment": None,
|
||||
"evaluation_result": None,
|
||||
"new_alt_text": None
|
||||
"new_alt_text": None,
|
||||
}
|
||||
|
||||
|
||||
# Extract JSON content between ```json and ``` markers
|
||||
json_match = re.search(r'```json\s*(.*?)\s*```', mllm_response, re.DOTALL)
|
||||
|
||||
json_match = re.search(r"```json\s*(.*?)\s*```", mllm_response, re.DOTALL)
|
||||
|
||||
if not json_match:
|
||||
# Try to find JSON without markdown code blocks
|
||||
json_match = re.search(r'\{.*\}', mllm_response, re.DOTALL)
|
||||
|
||||
json_match = re.search(r"\{.*\}", mllm_response, re.DOTALL)
|
||||
|
||||
if not json_match:
|
||||
return {
|
||||
"original_alt_text_assessment": None,
|
||||
"assessment": None,
|
||||
"evaluation_result": None,
|
||||
"new_alt_text": None
|
||||
"new_alt_text": None,
|
||||
}
|
||||
|
||||
json_str = json_match.group(1) if '```json' in mllm_response else json_match.group(0)
|
||||
|
||||
|
||||
json_str = (
|
||||
json_match.group(1) if "```json" in mllm_response else json_match.group(0)
|
||||
)
|
||||
|
||||
# Parse the JSON string
|
||||
parsed_data = json.loads(json_str)
|
||||
|
||||
|
||||
# Create a structured output with the key attributes
|
||||
result = {
|
||||
"original_alt_text_assessment": parsed_data.get("Original alt-text assessment", ""),
|
||||
"original_alt_text_assessment": parsed_data.get(
|
||||
"Original alt-text assessment", ""
|
||||
),
|
||||
"assessment": parsed_data.get("Assessment", ""),
|
||||
"evaluation_result": parsed_data.get("EvaluationResult", ""),
|
||||
"new_alt_text": parsed_data.get("New alt-text", "")
|
||||
"new_alt_text": parsed_data.get("New alt-text", ""),
|
||||
}
|
||||
|
||||
|
||||
return result
|
||||
|
||||
|
||||
except json.JSONDecodeError as e:
|
||||
print(f"JSON parsing error: {e}")
|
||||
return {
|
||||
"original_alt_text_assessment": None,
|
||||
"assessment": None,
|
||||
"evaluation_result": None,
|
||||
"new_alt_text": None
|
||||
"new_alt_text": None,
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error parsing MLLM response: {e}")
|
||||
|
|
@ -277,5 +408,63 @@ def parse_mllm_alt_text_response(mllm_response):
|
|||
"original_alt_text_assessment": None,
|
||||
"assessment": None,
|
||||
"evaluation_result": None,
|
||||
"new_alt_text": None
|
||||
}
|
||||
"new_alt_text": None,
|
||||
}
|
||||
|
||||
|
||||
def parse_mllm_standard_response(mllm_response):
    """Parse a standard MLLM judgment response into a structured dict.

    The model output may wrap its JSON payload in ```json ... ``` fences or
    emit bare JSON; both are handled. Extracts the "Assessment", "Judgment"
    and "EvaluationResult" fields.

    Args:
        mllm_response (str | None): raw MLLM response text.

    Returns:
        dict: keys "assessment", "judgment", "evaluation_result"; all values
        are None when the input is empty or cannot be parsed.
    """
    # Single definition of the failure shape (was repeated four times).
    empty_result = {
        "assessment": None,
        "judgment": None,
        "evaluation_result": None,
    }

    try:
        # Handle None / empty-string input up front.
        if not mllm_response:
            return dict(empty_result)

        # Prefer JSON fenced between ```json and ``` markers.
        # Fix: track WHICH pattern matched instead of re-testing
        # '```json' in the text — the old check could select group(1)
        # on the group-less fallback pattern and raise IndexError.
        fenced = re.search(r"```json\s*(.*?)\s*```", mllm_response, re.DOTALL)
        if fenced:
            json_str = fenced.group(1)
        else:
            # Fall back to the first brace-delimited span.
            bare = re.search(r"\{.*\}", mllm_response, re.DOTALL)
            if not bare:
                return dict(empty_result)
            json_str = bare.group(0)

        print("Extracted JSON string from MLLM response:", json_str)

        # Parse the JSON string
        parsed_data = json.loads(json_str)

        # Create a structured output with the key attributes
        return {
            "assessment": parsed_data.get("Assessment", ""),
            "judgment": parsed_data.get("Judgment", ""),
            "evaluation_result": parsed_data.get("EvaluationResult", ""),
        }

    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        return dict(empty_result)
    except Exception as e:
        print(f"Error parsing MLLM response: {e}")
        return dict(empty_result)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,73 @@
|
|||
import asyncio
|
||||
from playwright.async_api import async_playwright
|
||||
from datetime import datetime, timezone
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from typing import List, Dict, Optional
|
||||
import json
|
||||
import argparse
|
||||
from dependences.utils import disclaim_bool_string, prepare_output_folder, create_folder
|
||||
import requests
|
||||
import os
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class PageTitleExtractor:
    """Retrieve a page's <title> plus condensed structural context via Playwright."""

    def __init__(self, url: str, threshold: int = 200):
        # Page to inspect and the per-element text truncation limit.
        self.url = url
        self.threshold = threshold

    async def extract_page_title(self) -> Dict:
        """Load the page and return its title and structural content.

        Returns:
            Dict with "page_url", "title" and "structural_content" keys,
            or {"error": str} when navigation/extraction fails.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()

            try:
                # DOM readiness is enough; full resource loading is not needed.
                await page.goto(self.url, timeout=50000, wait_until="domcontentloaded")

                # Page title; a placeholder keeps the output shape stable when
                # the page has no title.
                title = await page.title() or "<title></title>"

                # Headings and main content gathered in a single JS call.
                # The f-string injects self.threshold; doubled {{ }} escape the
                # JS braces from Python's formatter.
                structural_text = await page.evaluate(
                    f"""
                    () => {{
                        const threshold = {self.threshold};
                        //const elements = document.querySelectorAll('h1, h2, h3, main');
                        const elements = document.querySelectorAll('h1, main');// we want to focus on main and h1 for the g88 evaluation, to have a more concise output for the LLM, but this can be easily changed to include more tags if needed

                        return Array.from(elements)
                            .map(el => {{
                                const tag = el.tagName.toLowerCase();
                                let text = el.innerText.replace(/\\n/g, ' ').trim();

                                if (text.length > threshold) {{
                                    text = text.substring(0, threshold) + '...';
                                }}

                                return text ? `<${{tag}}>${{text}}</${{tag}}>` : null;
                            }})
                            .filter(Boolean)
                            .join(' ');
                    }}
                    """
                )

                return {
                    "page_url": self.url,
                    "title": title,
                    "structural_content": structural_text,
                }

            except Exception as e:
                print(f"Error extracting page title: {e}")
                return {"error": str(e)}

            finally:
                await browser.close()
|
||||
|
|
@ -9,6 +9,7 @@ import base64
|
|||
import sqlite3
|
||||
from PIL import Image
|
||||
import io
|
||||
from datetime import datetime, timezone
|
||||
|
||||
exception_msg = "Exception: %s"
|
||||
|
||||
|
|
@ -116,6 +117,24 @@ def prepare_output_folder(file, now_str):
|
|||
return output_dir
|
||||
|
||||
|
||||
def prepare_folder_path(json_content, mllm_model_id, tecnhnique_name=""):
    """Derive a filesystem-safe path pair for persisting an evaluation run.

    Args:
        json_content: request dict containing "page_url".
        mllm_model_id: model identifier (":" is mapped to "-").
        tecnhnique_name: WCAG technique label embedded in the folder name.

    Returns:
        tuple: (sanitized, length-capped URL path component,
                "<model>_<technique>_<UTC timestamp>" folder name).
    """
    # Strip the scheme separator, then map characters that are unsafe or
    # awkward in folder names to underscores ("//" before "/" keeps the
    # original single-underscore result for the scheme slashes).
    sanitized = json_content["page_url"].replace(":", "")
    for token in ("//", "/", "%2", "?", "=", "&"):
        sanitized = sanitized.replace(token, "_")
    sanitized = sanitized[:50]  # limit length

    timestamp = datetime.now(timezone.utc).strftime("%Y_%m_%d-%H_%M_%S")
    folder_name = "_".join((mllm_model_id.replace(":", "-"), tecnhnique_name, timestamp))
    return sanitized, folder_name
|
||||
|
||||
|
||||
def create_folder(root_path, directory_separator, next_path):
|
||||
output_dir = root_path + directory_separator + next_path
|
||||
try:
|
||||
|
|
@ -131,19 +150,19 @@ def create_folder(root_path, directory_separator, next_path):
|
|||
|
||||
def encode_image_from_url(image_url):
    """Download an image and return it as a base64-encoded PNG string.

    The image is normalised to RGB so formats with alpha or palette modes
    (RGBA, P, L, ...) encode cleanly as PNG.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP response.
    """
    # Bounded download: fail fast instead of hanging forever, and surface
    # HTTP errors instead of trying to decode an error page as an image.
    response = requests.get(image_url, timeout=30)
    response.raise_for_status()

    # Open image and convert to RGB (handles RGBA, grayscale, etc.)
    image = Image.open(io.BytesIO(response.content))
    if image.mode != "RGB":
        image = image.convert("RGB")

    # Save to bytes buffer
    buffer = io.BytesIO()
    image.save(buffer, format="PNG")  # or 'JPEG'
    buffer.seek(0)

    # Encode to base64
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,153 @@
|
|||
from fastapi import APIRouter, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
import logging
|
||||
from pydantic import BaseModel
|
||||
import json
|
||||
import aiofiles
|
||||
import asyncio
|
||||
|
||||
from dependences.utils import (
|
||||
disclaim_bool_string,
|
||||
prepare_output_folder,
|
||||
prepare_folder_path,
|
||||
create_folder,
|
||||
db_persistence_insert,
|
||||
)
|
||||
from dependences.title_content_extractor import PageTitleExtractor
|
||||
from dependences.mllm_management import MLLMManager, parse_mllm_standard_response
|
||||
|
||||
invalid_json_input_msg = "Invalid JSON format"
|
||||
unexpected_error_msg = "Unexpected Error: could not end the process"
|
||||
|
||||
|
||||
class WCAG_g88Valuation(BaseModel):
    # Request body for the /wcag_g88_validation endpoint.

    # URL of the page whose <title> descriptiveness is evaluated (WCAG G88).
    page_url: str = "https://www.bbc.com"

    # "True"/"False" string flag: when true, the MLLM output is also saved to disk.
    save_elaboration: str = "True"
|
||||
|
||||
|
||||
class WCAG_g88ValuationRoutes:
    """FastAPI routes for WCAG G88 validation (descriptive page titles)."""

    def __init__(self, connection_db, mllm_settings):
        # DB handle for persisting results and MLLM endpoint/model settings.
        self.connection_db = connection_db
        self.mllm_settings = mllm_settings
        self.router = APIRouter()

        self.router.add_api_route(
            "/wcag_g88_validation",
            self.wcag_g88_validation,
            methods=["POST"],
            tags=["Wcag G88 Validation"],
            description="WCAG validator G88 validation: Providing descriptive titles for web pages",
            name="wcag G88 validation",
            dependencies=[],
        )

        logging.info("wcag g88 routes correctly initialized.")

    async def wcag_g88_validation(
        self, request: Request, data: WCAG_g88Valuation
    ) -> JSONResponse:
        """Run the G88 (descriptive page title) assessment for the requested page.

        Extracts the page title and structural context, asks the MLLM for a
        judgment, persists the result to the local DB (best-effort) and
        optionally to a JSON file, and returns the combined result.
        """
        try:
            print("Received wcag G88 validation request.")
            json_content = json.loads(data.model_dump_json())
            mllm_model_id = self.mllm_settings["mllm_model_id"]

            # Compute the save flag once; it gates both folder creation and
            # the JSON dump below (output_dir is only defined when saving).
            save_elaboration = disclaim_bool_string(json_content["save_elaboration"])
            if save_elaboration:
                # Fix: label the output folder with the technique actually
                # run here ("g88", not "h58").
                url_path, folder_str = prepare_folder_path(
                    json_content, mllm_model_id, tecnhnique_name="g88"
                )
                output_dir = prepare_output_folder(url_path, folder_str)

            # Extract the page <title> plus structural context (<main>, headings).
            title_content_extractor = PageTitleExtractor(
                json_content["page_url"], threshold=800
            )
            logging.info(f"Extracting title-content from: {json_content['page_url']}")
            title_content = await title_content_extractor.extract_page_title()
            print("Extracted title_content.", title_content)

            # MLLM settings
            mllm_end_point = self.mllm_settings["mllm_end_point"]
            mllm_api_key = self.mllm_settings["mllm_api_key"]

            logging.info("mllm_end_point:%s", mllm_end_point)
            logging.info("mllm_model_id:%s", mllm_model_id)

            # Create MLLM manager and run the G88 evaluation.
            mllm_manager = MLLMManager(mllm_end_point, mllm_api_key, mllm_model_id)
            logging.info("mllm_manager.end_point:%s", mllm_manager.end_point)
            mllm_responses = mllm_manager.make_g88_evaluation(
                title_content=title_content,
                openai_model=self.mllm_settings["openai_model"],
            )
            parsed_mllm_responses = parse_mllm_standard_response(
                mllm_responses["mllm_response"]
            )
            mllm_responses_object = {"mllm_g88_assessments": parsed_mllm_responses}

            returned_object = {
                "title_content": title_content,
                "mllm_validations": mllm_responses_object,
            }

            try:
                # Persist to local db (best-effort: failures are logged, not raised).
                json_in_str = json.dumps(title_content, ensure_ascii=False)
                json_out_str = json.dumps(mllm_responses_object, ensure_ascii=False)

                db_persistence_insert(
                    connection_db=self.connection_db,
                    insert_type="wcag_g88_validation",
                    page_url=json_content["page_url"],
                    llm_model=mllm_model_id,
                    json_in_str=json_in_str,
                    json_out_str=json_out_str,
                    table="wcag_validator_results",
                )

            except Exception as e:
                logging.error("error persisting to local db: %s", e)

            if save_elaboration:
                # Save mllm input and responses asynchronously so the event
                # loop is not blocked by disk I/O.
                async with aiofiles.open(
                    output_dir + "/mllm_assessments.json", "w", encoding="utf-8"
                ) as f:
                    await f.write(
                        json.dumps(returned_object, indent=2, ensure_ascii=False)
                    )

            return JSONResponse(content=returned_object, status_code=200)

        except json.JSONDecodeError:
            logging.error(invalid_json_input_msg)
            return JSONResponse(
                content={"error": invalid_json_input_msg}, status_code=400
            )

        except Exception as e:
            logging.error(unexpected_error_msg + " %s", e)
            return JSONResponse(
                content={"error": unexpected_error_msg}, status_code=500
            )
|
||||
|
|
@ -0,0 +1,156 @@
|
|||
from fastapi import APIRouter, Request
|
||||
from fastapi.responses import JSONResponse
|
||||
import logging
|
||||
from pydantic import BaseModel
|
||||
import json
|
||||
|
||||
|
||||
from dependences.utils import (
|
||||
disclaim_bool_string,
|
||||
prepare_output_folder,
|
||||
prepare_folder_path,
|
||||
create_folder,
|
||||
db_persistence_insert,
|
||||
)
|
||||
from dependences.language_extractor import LanguageExtractor
|
||||
from dependences.mllm_management import MLLMManager, parse_mllm_alt_text_response
|
||||
|
||||
invalid_json_input_msg = "Invalid JSON format"
|
||||
unexpected_error_msg = "Unexpected Error: could not end the process"
|
||||
|
||||
|
||||
class WCAG_h58Valuation(BaseModel):
|
||||
page_url: str = "https://www.bbc.com"
|
||||
#context_levels: int = 5
|
||||
#pixel_distance_threshold: int = 200
|
||||
#number_of_images: int = 10
|
||||
#save_images: str = "True"
|
||||
save_elaboration: str = "True"
|
||||
#specific_images_urls: List[str] = []
|
||||
|
||||
|
||||
class WCAG_h58ValuationRoutes:
    """FastAPI route collection for the WCAG H58 validation endpoint.

    H58 concerns using language attributes (lang/xml:lang) to identify
    changes in the human language of a page; the actual assessment is
    delegated to an MLLM through MLLMManager.
    """

    def __init__(self, connection_db, mllm_settings):
        # DB connection (persistence step currently disabled, see handler)
        # and MLLM endpoint/model configuration.
        self.connection_db = connection_db
        self.mllm_settings = mllm_settings
        self.router = APIRouter()

        self.router.add_api_route(
            "/wcag_h58_validation",
            self.wcag_h58_validation,
            methods=["POST"],
            tags=["Wcag H58 Validation"],
            description="WCAG validator H58 validation: Using language attributes to identify changes in the human language",
            name="wcag H58 validation",
            dependencies=[],
        )

        logging.info("wcag h58 routes correctly initialized.")

    async def wcag_h58_validation(
        self, request: Request, data: WCAG_h58Valuation
    ) -> JSONResponse:
        """Return the H58 (language-change identification) assessment for a page.

        Extracts language information from the requested page, asks the
        configured MLLM to evaluate it, optionally saves the responses to
        disk, and returns the assessment as JSON.

        Returns:
            200 with the MLLM assessments; 400 on invalid JSON input;
            500 on any other failure.
        """
        try:
            logging.info("Received wcag H58 validation request.")
            json_content = json.loads(data.model_dump_json())
            mllm_model_id = self.mllm_settings["mllm_model_id"]

            # Resolve the save flag once so the folder-preparation step and
            # the save step below cannot disagree (the original evaluated
            # the flag twice, risking an unbound output_dir).
            save_elaboration = disclaim_bool_string(json_content["save_elaboration"])

            output_dir = None
            if save_elaboration:  # prepare output folders only if something to save
                # NOTE(review): "tecnhnique_name" looks like a typo for
                # "technique_name" — confirm against prepare_folder_path's
                # signature before renaming here.
                url_path, folder_str = prepare_folder_path(
                    json_content, mllm_model_id, tecnhnique_name="h58"
                )
                output_dir = prepare_output_folder(url_path, folder_str)

            # Extract language attributes / textual elements from the page.
            language_extractor = LanguageExtractor(
                json_content["page_url"],
            )
            logging.info("Extracting languages from: %s", json_content["page_url"])
            languages = await language_extractor.extract_content_with_lang_context()
            logging.info("Extracted languages and textual elements: %s", languages)

            # TODO(review): debug placeholders — the extracted `languages`
            # result is not used yet; wire it into the MLLM call.
            main_language = "italian"
            other_textual_elements = "ciao casa"

            # MLLM settings
            mllm_end_point = self.mllm_settings["mllm_end_point"]
            mllm_api_key = self.mllm_settings["mllm_api_key"]

            logging.info("mllm_end_point:%s", mllm_end_point)
            logging.info("mllm_model_id:%s", mllm_model_id)

            # Create MLLM manager and run the H58 evaluation.
            mllm_manager = MLLMManager(mllm_end_point, mllm_api_key, mllm_model_id)
            logging.info("mllm_manager.end_point:%s", mllm_manager.end_point)
            mllm_responses = mllm_manager.make_h58_evaluation(
                main_language,
                other_textual_elements,
                openai_model=self.mllm_settings["openai_model"],
            )

            # NOTE: responses are returned raw; parse_mllm_alt_text_response
            # is intentionally bypassed for this technique.

            mllm_responses_object = {
                "mllm_h58_assessments": mllm_responses,
            }
            returned_object = {
                "mllm_validations": mllm_responses_object,
            }

            # TODO(review): local-db persistence (db_persistence_insert into
            # wcag_validator_results) is disabled pending an H58-specific
            # insert_type; see the alt-text route for the reference shape.

            if save_elaboration:  # optionally save the MLLM responses to JSON
                with open(
                    output_dir + "/mllm_assessments.json", "w", encoding="utf-8"
                ) as f:
                    json.dump(mllm_responses, f, indent=2, ensure_ascii=False)

            return JSONResponse(content=returned_object, status_code=200)

        except json.JSONDecodeError:
            logging.error(invalid_json_input_msg)
            return JSONResponse(
                content={"error": invalid_json_input_msg}, status_code=400
            )

        except Exception as e:
            logging.error(unexpected_error_msg + " %s", e)
            return JSONResponse(
                content={"error": unexpected_error_msg}, status_code=500
            )
|
||||
|
|
@ -19,6 +19,8 @@ from restserver.routers import (
|
|||
routes_local_db,
|
||||
routes_wcag_alttext,
|
||||
routes_extract_images,
|
||||
routes_wcag_h58,
|
||||
routes_wcag_g88
|
||||
)
|
||||
|
||||
from dependences.utils import (
|
||||
|
|
@ -42,14 +44,28 @@ def server(connection_db, mllm_settings):
|
|||
|
||||
health_routes = routes_health.HealthRoutes()
|
||||
local_db_routes = routes_local_db.LocalDBRoutes(connection_db)
|
||||
extract_images_routes = routes_extract_images.ExtractImagesRoutes()
|
||||
|
||||
|
||||
wcag_alttext_routes = routes_wcag_alttext.WCAGAltTextValuationRoutes(
|
||||
connection_db, mllm_settings
|
||||
)
|
||||
extract_images_routes = routes_extract_images.ExtractImagesRoutes()
|
||||
wcag_h58_routes = routes_wcag_h58.WCAG_h58ValuationRoutes(
|
||||
connection_db, mllm_settings)
|
||||
|
||||
wcag_g88_routes = routes_wcag_g88.WCAG_g88ValuationRoutes(
|
||||
connection_db, mllm_settings)
|
||||
|
||||
|
||||
|
||||
|
||||
app.include_router(wcag_alttext_routes.router, prefix="")
|
||||
app.include_router(wcag_h58_routes.router, prefix="")
|
||||
app.include_router(wcag_g88_routes.router, prefix="")
|
||||
|
||||
# do not use LLMs
|
||||
app.include_router(health_routes.router, prefix="")
|
||||
app.include_router(local_db_routes.router, prefix="")
|
||||
app.include_router(wcag_alttext_routes.router, prefix="")
|
||||
app.include_router(extract_images_routes.router, prefix="")
|
||||
return app
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue