# wcag_AI_validation/restserver/routers/routes_wcag_g88.py
from fastapi import APIRouter, Request
from fastapi.responses import JSONResponse
import logging
from pydantic import BaseModel
import json
import aiofiles
import asyncio
from dependences.utils import (
disclaim_bool_string,
prepare_output_folder,
prepare_folder_path,
create_folder,
db_persistence_insert,
)
from dependences.title_content_extractor import PageTitleExtractor
from dependences.mllm_management import MLLMManager, parse_mllm_standard_response
# Canned error messages returned in the JSON error responses of the
# /wcag_g88_validation endpoint (400 and 500 paths respectively).
invalid_json_input_msg = "Invalid JSON format"
unexpected_error_msg = "Unexpected Error: could not end the process"
class WCAG_g88Valuation(BaseModel):
    """Request body for the /wcag_g88_validation endpoint."""

    # URL of the page whose title will be assessed (default is a sample page).
    page_url: str = "https://www.bbc.com"
    # Stringified boolean ("True"/"False"); parsed with disclaim_bool_string
    # to decide whether the elaboration is also written to disk.
    save_elaboration: str = "True"
class WCAG_g88ValuationRoutes:
    """Routes for WCAG technique G88 validation (descriptive page titles).

    Registers a single POST route, ``/wcag_g88_validation``, that extracts a
    page's title/content, has one or two MLLMs assess it against G88,
    persists the outcome to the local DB and optionally saves it to disk.
    """

    def __init__(self, connection_db, mllm_settings):
        """Store dependencies and register the route on a fresh APIRouter.

        :param connection_db: open DB handle consumed by db_persistence_insert.
        :param mllm_settings: dict of MLLM endpoints/keys/model ids. When
            ``mllm_settings["openai_model"] == "Both"`` the nested
            ``*_remote`` / ``*_local`` entries are used; otherwise the flat
            entries are used directly.
        """
        self.connection_db = connection_db
        self.mllm_settings = mllm_settings
        self.router = APIRouter()
        self.router.add_api_route(
            "/wcag_g88_validation",
            self.wcag_g88_validation,
            methods=["POST"],
            tags=["Wcag G88 Validation"],
            description="WCAG validator G88 validation: Providing descriptive titles for web pages",
            name="wcag G88 validation",
            dependencies=[],
        )
        logging.info("wcag g88 routes correctly initialized.")

    def _model_id_for_logging(self):
        """Return the model-id string recorded with results.

        In dual-model mode both ids are joined with '&' so DB rows and output
        folders identify which pair of models produced the assessment.
        """
        if self.mllm_settings["openai_model"] == "Both":
            ids = self.mllm_settings["mllm_model_id"]
            return ids["model_id_remote"] + "&" + ids["model_id_local"]
        return self.mllm_settings["mllm_model_id"]

    def _evaluate_both_models(self, title_content):
        """Run remote and local MLLM evaluations concurrently.

        Returns the combined assessments object keyed by
        ``mllm_g88_assessments`` with one entry per model.
        """
        # Local import kept deliberately: only needed in dual-model mode.
        from concurrent.futures import ThreadPoolExecutor

        def run_model_evaluation(endpoint, api_key, model_id, openai_model, label):
            # Each worker builds its own manager; MLLMManager instances are
            # not shared across threads.
            manager = MLLMManager(endpoint, api_key, model_id)
            logging.info(
                "Using %s model for title evaluation: %s", label, manager.end_point
            )
            logging.info("mllm_end_point:%s", endpoint)
            logging.info("mllm_model_id:%s", model_id)
            responses = manager.make_g88_evaluation(
                title_content, openai_model=openai_model
            )
            return parse_mllm_standard_response(
                responses["mllm_response"], model_id=model_id
            )

        end_points = self.mllm_settings["mllm_end_point"]
        api_keys = self.mllm_settings["mllm_api_key"]
        model_ids = self.mllm_settings["mllm_model_id"]
        with ThreadPoolExecutor(max_workers=2) as executor:
            future_openai = executor.submit(
                run_model_evaluation,
                end_points["model_end_point_remote"],
                api_keys["api_key_remote"],
                model_ids["model_id_remote"],
                True,
                "first remote",
            )
            future_local = executor.submit(
                run_model_evaluation,
                end_points["model_end_point_local"],
                api_keys["api_key_local"],
                model_ids["model_id_local"],
                False,
                "second local",
            )
            mllm_responses_openai = future_openai.result()
            mllm_responses_local = future_local.result()
        return {
            "mllm_g88_assessments": {
                "mllm_g88_assessments_openai": mllm_responses_openai,
                "mllm_g88_assessments_local": mllm_responses_local,
            }
        }

    def _evaluate_single_model(self, title_content):
        """Run the G88 evaluation against the single configured MLLM."""
        mllm_end_point = self.mllm_settings["mllm_end_point"]
        mllm_api_key = self.mllm_settings["mllm_api_key"]
        mllm_model_id = self.mllm_settings["mllm_model_id"]
        logging.info("mllm_end_point:%s", mllm_end_point)
        logging.info("mllm_model_id:%s", mllm_model_id)
        mllm_manager = MLLMManager(mllm_end_point, mllm_api_key, mllm_model_id)
        logging.info(
            "Using single model for g88 evaluation: %s", mllm_manager.end_point
        )
        mllm_responses = mllm_manager.make_g88_evaluation(
            title_content=title_content,
            openai_model=self.mllm_settings["openai_model"],
        )
        parsed_mllm_responses = parse_mllm_standard_response(
            mllm_responses["mllm_response"], model_id=mllm_model_id
        )
        return {"mllm_g88_assessments": parsed_mllm_responses}

    def _persist_result(self, page_url, llm_model, title_content, mllm_responses_object):
        """Best-effort insert into the local results DB.

        Persistence failures are logged and swallowed on purpose so a DB
        outage never fails the HTTP response.
        """
        try:
            json_in_str = json.dumps(title_content, ensure_ascii=False)
            json_out_str = json.dumps(mllm_responses_object, ensure_ascii=False)
            db_persistence_insert(
                connection_db=self.connection_db,
                insert_type="wcag_g88_validation",
                page_url=page_url,
                llm_model=llm_model,
                json_in_str=json_in_str,
                json_out_str=json_out_str,
                table="wcag_validator_results",
            )
        except Exception as e:
            logging.error("error persisting to local db: %s", e)

    async def wcag_g88_validation(
        self, request: Request, data: WCAG_g88Valuation
    ) -> JSONResponse:
        """POST handler: assess WCAG G88 (descriptive page title) for ``data.page_url``.

        Extracts the page title/content, evaluates it with one or two MLLMs
        depending on settings, persists the result, optionally saves it to
        disk, and returns the combined object.

        :returns: 200 with ``{"title_content", "mllm_validations"}`` on
            success, 400 on invalid JSON, 500 on any other failure.
        """
        try:
            logging.info("Received wcag G88 validation request.")
            # FastAPI/pydantic already validated the body; no need for a
            # model_dump_json() -> json.loads() round trip.
            json_content = data.model_dump()
            mllm_model_id_for_logging = self._model_id_for_logging()
            # Parse the save flag once and reuse it for both save points.
            save_elaboration = disclaim_bool_string(json_content["save_elaboration"])
            output_dir = None
            if save_elaboration:
                # NOTE(review): 'tecnhnique_name' typo matches the current
                # prepare_folder_path signature — fix in dependences.utils first.
                url_path, folder_str = prepare_folder_path(
                    json_content, mllm_model_id_for_logging, tecnhnique_name="g88"
                )
                output_dir = prepare_output_folder(url_path, folder_str)
            # Extract title and content from the target page.
            title_content_extractor = PageTitleExtractor(
                json_content["page_url"], threshold=800
            )
            logging.info(
                "Extracting title-content from: %s", json_content["page_url"]
            )
            title_content = await title_content_extractor.extract_page_title()
            logging.info("Extracted title_content: %s", title_content)
            if self.mllm_settings["openai_model"] == "Both":
                mllm_responses_object = self._evaluate_both_models(title_content)
            else:
                mllm_responses_object = self._evaluate_single_model(title_content)
            # Common: object returned in the response body.
            returned_object = {
                "title_content": title_content,
                "mllm_validations": mllm_responses_object,
            }
            self._persist_result(
                json_content["page_url"],
                mllm_model_id_for_logging,
                title_content,
                mllm_responses_object,
            )
            if save_elaboration and output_dir is not None:
                # Async write so the event loop is not blocked by disk I/O.
                async with aiofiles.open(
                    output_dir + "/mllm_assessments.json", "w", encoding="utf-8"
                ) as f:
                    await f.write(
                        json.dumps(returned_object, indent=2, ensure_ascii=False)
                    )
            return JSONResponse(content=returned_object, status_code=200)
        except json.JSONDecodeError:
            logging.error(invalid_json_input_msg)
            return JSONResponse(
                content={"error": invalid_json_input_msg}, status_code=400
            )
        except Exception as e:
            logging.error(unexpected_error_msg + " %s", e)
            return JSONResponse(
                content={"error": unexpected_error_msg}, status_code=500
            )