# --- Imports ---
import os
import pandas as pd
from openai import AzureOpenAI
import pickle
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import openpyxl
import re
import json
from openpyxl.styles import PatternFill
from openpyxl import load_workbook
from collections import Counter
from prompts.prompt import build_prompt_local
import warnings
import logging


# --- Configuration ---
# SECURITY: the Azure OpenAI API key was previously hard-coded here (and
# committed to version control). Read credentials from the environment
# instead; the previously committed key must be considered leaked and rotated.
endpoint = os.environ.get(
    "AZURE_OPENAI_ENDPOINT",
    "https://gpt-sw-central-tap-security.openai.azure.com/",
)
deployment = os.environ.get("AZURE_OPENAI_DEPLOYMENT", "gpt-4o")
subscription_key = os.environ["AZURE_OPENAI_API_KEY"]

client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2024-05-01-preview",
)

# ----- Step 1: load datasets -----
# Annotated columns: automation, description, category, subcategory, problem_type, gravity
df_labeled = pd.read_csv("main/datasets/annotated_dataset.csv", encoding="cp1252", sep=';')
df_unlabeled = pd.read_csv("main/datasets/unlabeled_dataset.csv", sep='\t', encoding='utf-8')
print("***STEP 1***\nDataset etichettato caricato. Numero righe:", len(df_labeled),
      "\nDataset non etichettato caricato. Numero righe:", len(df_unlabeled))


def clean_id(x):
    """Normalise an automation id.

    Returns the first run of digits found in the value, the stripped string
    when no digits are present, or "" for NaN/missing values.
    """
    if pd.isna(x):
        return ""
    s = str(x)
    m = re.search(r"\d+", s)  # first digit sequence
    return m.group(0) if m else s.strip()


df_labeled["automation_id"] = df_labeled["automation_id"].apply(clean_id)
df_unlabeled["automation_id"] = df_unlabeled["automation_id"].apply(clean_id)
df_labeled["folder"] = df_labeled["folder"].astype(str).str.strip()
df_unlabeled["folder"] = df_unlabeled["folder"].astype(str).str.strip()

# Set of (automation_id, folder) pairs that are already labeled.
labeled_pairs = set(zip(df_labeled["automation_id"], df_labeled["folder"]))

# Keep only the automations that have not been labeled yet.
df_unlabeled_filtered = df_unlabeled[
    ~df_unlabeled.apply(lambda row: (row["automation_id"], row["folder"]) in labeled_pairs, axis=1)
]
print("Automazioni non etichettate rimanenti dopo la pulizia:", len(df_unlabeled_filtered))


# --- Step 2: embeddings ---
# Silence generic warnings and the noisy transformers / sentence-transformers /
# HF hub loggers so the progress prints stay readable.
warnings.filterwarnings("ignore")
logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR)
logging.getLogger("huggingface_hub").setLevel(logging.ERROR)

print("\n***Step 2 ***\nEmbeddings")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Pre-computed embeddings of the labeled dataset (same row order as df_labeled
# is assumed -- TODO confirm against the script that produced the pickle).
with open("main/labeled_embeddings.pkl", "rb") as f:
    data = pickle.load(f)

embeddings = data['embeddings'].astype("float32")
print("Shape embeddings:", embeddings.shape)


# ----- Step 3: build the FAISS index ---
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # exact L2 (Euclidean) index
index.add(embeddings)
print(f"\n***Step 3: Indice FAISS creato***. \nNumero di vettori nell'indice: {index.ntotal}")
# ----- Step 4: retrieval (similarity) ---
# Trial run on the first 500 non-annotated automations.
k = 5
output_rows = []
df_sample = df_unlabeled_filtered.head(500)
llm_rows = []

# Human-review override thresholds (loop-invariant, hoisted out of the loop).
OVERRIDE_MAX_DISTANCE = 0.90
OVERRIDE_MIN_CONSISTENCY = 0.60


def sim_label(distance: float) -> str:
    """Map an L2 distance to a human-readable similarity band."""
    if distance <= 0.50:
        return "Match forte"
    elif distance <= 0.75:
        return "Match plausibile"
    elif distance <= 0.90:
        return "Similarità instabile"
    else:
        return "Troppo distante"


for i, row in df_sample.iterrows():
    query_text = str(row["human_like"])
    print("numero corrente:", i)

    # Embed the new automation and retrieve its k nearest labeled neighbours.
    query_emb = model.encode([query_text], convert_to_numpy=True).astype("float32")
    distances, indices = index.search(query_emb, k)

    # Aggregate metrics over the top-k neighbours (computed once per automation).
    top1_distance = float(distances[0][0])
    top1_confidence = 1 / (1 + top1_distance)
    top1_similarity_label = sim_label(top1_distance)
    # Categories of the k neighbours, in rank order (the original code also
    # computed per-rank distance/confidence/label here and discarded them).
    topk_cats = [str(df_labeled.iloc[int(indices[0][rank])]["category"]) for rank in range(k)]

    rank1_category = topk_cats[0] if topk_cats else ""
    majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
    consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0

    # One output row per retrieved neighbour, with the aggregate top-k metrics
    # repeated on every row for easier filtering in Excel.
    for rank in range(k):
        idx = int(indices[0][rank])
        distance = float(distances[0][rank])
        confidence = 1 / (1 + distance)
        label = sim_label(distance)
        retrieved_row = df_labeled.iloc[idx]

        output_rows.append({
            # query
            "automazione da etichettare": query_text,

            # retrieval info for this row
            "rank": rank + 1,
            "retrieved_idx": idx,
            "automazione simile": retrieved_row["automation"],
            "categoria automazione simile": retrieved_row["category"],
            "distanza": distance,
            "confidence": round(confidence, 4),
            "similarity": label,

            # aggregate top-k metrics (repeated on every row)
            "rank1_distance": top1_distance,
            "rank1_confidence": round(top1_confidence, 4),
            "rank1_similarity_label": top1_similarity_label,
            "rank1_category": rank1_category,
            "majority_category": majority_category,
            "consistency": round(consistency, 3),
            "top5_categories": " | ".join(topk_cats)
        })

    # --- Step 5: send the data to the LLM ---
    # (1) Build the prompt from the retrieved neighbours.
    retrieved = df_labeled.iloc[indices[0]].copy()
    retrieved["distance"] = distances[0].astype(float)
    retrieved["confidence"] = retrieved["distance"].apply(lambda d: 1 / (1 + float(d)))
    retrieved["similarity"] = retrieved["distance"].apply(sim_label)
    prompt = build_prompt_local(query_text, retrieved, sim_label)

    # (2) Call the model: it must return JSON only (temperature 0 for determinism).
    resp = client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": "Return ONLY valid JSON. No extra text."},
            {"role": "user", "content": prompt},
        ],
        temperature=0
    )
    content = resp.choices[0].message.content.strip()

    # (3) Parse the response; on failure fall back to a "needs review" stub.
    try:
        parsed = json.loads(content)
    except Exception:
        parsed = {
            "automation": query_text,
            "category": "",
            "subcategory": "",
            "problem_type": "",
            "gravity": "",
            "scores": {},
            "needs_human_review": True,
            "short_rationale": f"JSON_PARSE_ERROR: {content[:200]}"
        }

    # (4) One row per automation, combining the retrieval metrics with the
    #     LLM output. Coerce to str so a JSON null cannot crash .strip() below.
    llm_category = str(parsed.get("category") or "")
    llm_subcategory = str(parsed.get("subcategory") or "")
    llm_problem_type = str(parsed.get("problem_type") or "")
    llm_gravity = str(parsed.get("gravity") or "")

    # Normalised final labels: HARMLESS automations carry no subcategory,
    # problem type or gravity. BUGFIX: keep llm_* as the raw model output and
    # apply the normalisation to the final_* fields -- the original code did
    # the opposite (it blanked llm_* and left final_* un-normalised).
    final_category = llm_category
    final_subcategory = llm_subcategory
    final_problem_type = llm_problem_type
    final_gravity = llm_gravity
    if llm_category.strip().upper() == "HARMLESS":
        final_subcategory = ""
        final_problem_type = "NONE"
        final_gravity = "NONE"

    # ================= HUMAN REVIEW LOGIC =================
    needs_human_review = bool(parsed.get("needs_human_review", True))
    # Strong alignment: LLM = majority = top1 (and non-empty).
    aligned_strong = (
        llm_category == majority_category and
        llm_category == rank1_category and
        llm_category != ""
    )
    # Distance not excessive and good consistency among the neighbours.
    good_retrieval = (
        top1_distance <= OVERRIDE_MAX_DISTANCE and
        consistency >= OVERRIDE_MIN_CONSISTENCY
    )
    # Then NO review is required, even if the model had set it to True.
    if aligned_strong and good_retrieval:
        needs_human_review = False
    # =====================================================

    llm_rows.append({
        "automation_id": row.get("automation_id", ""),
        "folder": row.get("folder", ""),
        "automation_text": query_text,

        "rank1_distance": top1_distance,
        "rank1_confidence": round(top1_confidence, 4),
        "rank1_similarity_label": top1_similarity_label,
        "rank1_category": rank1_category,
        "majority_category": majority_category,
        "consistency": round(consistency, 3),
        "top5_categories": " | ".join(topk_cats),

        "llm_category": llm_category,
        "llm_subcategory": llm_subcategory,
        "llm_problem_type": llm_problem_type,
        "llm_gravity": llm_gravity,
        "llm_needs_human_review": parsed.get("needs_human_review", True),
        "final_needs_human_review": needs_human_review,

        "final_category": final_category,
        "final_subcategory": final_subcategory,
        "final_problem_type": final_problem_type,
        "final_gravity": final_gravity,

        "llm_rationale": parsed.get("short_rationale", "")
    })


# --- Step 6: integration and output ---
# (5) Export the final result as an Excel sheet.
df_llm = pd.DataFrame(llm_rows)
out_path = "main/datasets/labeling_first500.xlsx"
df_llm.to_excel(out_path, index=False)

wb = load_workbook(out_path)
ws = wb.active
# Conditional fills for the two needs_human_review columns:
# red = needs review, green = auto-accepted.
true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid")   # red
false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid")  # green
col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}


def _colour_bool_column(column_name):
    """Fill a boolean column red (True) / green (False); no-op if missing.

    Deduplicates the two identical per-column loops the script previously had.
    Operates on the module-level worksheet `ws` and `col_index` map.
    """
    c = col_index.get(column_name)
    if c is None:
        return
    for r in range(2, ws.max_row + 1):
        val = ws.cell(row=r, column=c).value
        if val is True:
            ws.cell(row=r, column=c).fill = true_fill
        elif val is False:
            ws.cell(row=r, column=c).fill = false_fill


_colour_bool_column("llm_needs_human_review")
_colour_bool_column("final_needs_human_review")

wb.save(out_path)
print(f"\n***Step 6: Retrieval e LLM ***\nExcel LLM salvato in {out_path}")

# --- needs_human_review count ---
review_counts = df_llm["final_needs_human_review"].value_counts(dropna=False)
true_count = review_counts.get(True, 0)
false_count = review_counts.get(False, 0)
print("\n--- Needs human review summary ---")
print(f"needs_human_review = True : {true_count}")
print(f"needs_human_review = False: {false_count}")


# --- Step 7: final dataset over all automations (text + labels only) ---
df_final = df_llm[[
    "automation_text",
    "llm_category",
    "llm_subcategory",
    "llm_gravity",
    "llm_problem_type",
    "final_needs_human_review"
]].rename(columns={
    "llm_category": "category",
    "llm_subcategory": "subcategory",
    "llm_gravity": "gravity",
    "llm_problem_type": "problem_type"
})

# Normalise string columns (NaN -> "", strip whitespace).
for col in ["category", "subcategory", "gravity", "problem_type"]:
    df_final[col] = df_final[col].fillna("").astype(str).str.strip()
# prompt.py -- prompt-building blocks for the smart-home automation security
# classifier. Exposes `build_prompt_local`, which assembles the task
# instructions, the taxonomy, the grading guides and the retrieved few-shot
# examples into a single prompt string for the LLM.

task = """
You are a security evaluation tool for smart home automation rules.
Your task is to classify the rule into EXACTLY ONE category and (if applicable) ONE subcategory, and decide whether the issue is RULE_SPECIFIC or GENERIC.

You will receive:
- The automation rule (text).
- Definitions of 4 categories (with subcategories).
- Retrieved similar labeled examples (RAG context) with distance-based similarity levels.

Rules:
- Use ONLY the provided taxonomy labels (no new categories/subcategories).
- If retrieved examples are weak/unstable or the rule is ambiguous, set needs_human_review=true.
- Return ONLY a valid JSON object (no extra text).
"""

taxonomy = """
* CATEGORIES AND DEFINITIONS (use ONLY these labels) *

 # CATEGORY 1: PERSONAL PRIVACY VIOLATION
 Definition: This category includes automations that collect, process, or disseminate personal or sensitive data (own or third-party) in an unnecessary, unintentional, or potentially illicit manner.
 Necessary rule conditions: the rule collects sensitive information (images, video, audio, location, identity).
 Risk Conditions:
 R1: The data is shared with external parties
 - neighbors
 - WhatsApp groups
 - social media
 - mailing lists
 - public channels
 R2: The data is made publicly accessible
 - publicly visible
 - public link
 - open profile
 R3: The collection or sharing is not necessary for the stated purpose or is not clearly understood
 - no confirmation
 - no review before publication
 R4: The data collection may violate privacy regulations
 - camera photographing passers-by
 - audio recording of guests
 Do not apply if:
 - The data remains strictly internal and is not shared, exposed, or externally observable.
 - The automation only updates internal system states without transmitting or publishing personal data.
 - The data collection is clearly necessary for the rule's primary functionality and remains locally processed.
 Illustrative examples (for guidance only):
 - When the camera on my smart doorbell detects an unknown/suspicious person (e.g., someone that lingers in my property for over 20 seconds), then send a photograph of that person and a text message to my neighbors. S+
 - If I take a new photo, then post it as a publicly accessible image on Instagram

 # CATEGORY 2: PHYSICAL ENVIRONMENT THREAT
 Definition: This category includes automations that can cause unauthorized access, reduced physical security, or property damage.
 Sub-categories: 2.1 PROMOTE UNAUTHORIZED ACCESS, 2.2 DEVICE AND IDENTITY CONTROL, 2.3 VOICE PROFILE CONTROLS, 2.4 ABSENCE STATUS REPORTING

 ## SUB-CATEGORY 2.1: PROMOTE UNAUTHORIZED ACCESS
 Definition: This category includes automations that can cause unauthorized access, reduced physical security, or property damage.
 Necessary rule conditions:
 - Actions on: windows / doors / locks
 - Automatic activations based on: environmental conditions / unauthenticated events
 Risk Conditions:
 - The action reduces physical protection.
 - There is no identity check.
 - The event can be externally induced.
 Does not apply if:
 - There are already security measures such as checking the user's presence at home.
 - The rule only modifies non-security-related elements (e.g., lights, temperature).
 - The action is manually confirmed before execution.
 Illustrative examples (for guidance only):
 - When the smart thermostat detects that the temperature rises above 25 degrees, then slightly open the window.
 - If Indoor CO2 goes up, open the window.


 ## SUB-CATEGORY 2.2: Device and identity control (device-based access)
 Definition: Automations that grant physical access based solely on the presence of a device, without considering theft, compromise, or old, unremoved devices.
 Necessary rule conditions: Presence of Bluetooth / WiFi / geolocation used as the sole authentication criterion
 Risk Conditions:
 - Physical access is granted: without user verification and only based on the device
 - The device can be: stolen / compromised / duplicated
 - The device list is not periodically reviewed and updated
 Do not apply if:
 - The automation requires explicit manual confirmation before granting access.
 - Additional authentication mechanisms are enforced (e.g., PIN, biometric verification, multi-factor authentication).
 - The device presence is not the sole authentication factor.
 - The rule does not grant physical access but only sends notifications or status updates.
 Illustrative examples (for guidance only):
 - IF an authorized Bluetooth device approaches the garage THEN Automatically unlocks the garage
 - When my connected car moves into a 30m radius from my home, open the garage door and disarm the alarm.
 - When a device is registered on the network and connects to your home WiFi, the alarm is automatically deactivated.

 ## SUB-CATEGORY 2.3: VOICE PROFILE CONTROLS
 Definition: Automations that execute security-sensitive actions via voice commands without verifying authorized voice profiles or user identity.
 Necessary rule conditions:
 - The automation is triggered by a voice command.
 - The command affects security-sensitive actions (e.g., unlocking, disarming, disabling protections).
 Risk Conditions:
 - The command can be executed by anyone
 - There is no control over the user's identity
 - No specific authorized voice profiles are stored
 Do not apply if:
 - The voice command triggers only non-security-sensitive actions (e.g., turning lights on/off).
 - The automation can only be executed from inside the home after physical access has already been established.
 - The system verifies authorized voice profiles before executing critical actions.
 - Additional authentication mechanisms are required for security-sensitive operations.
 Illustrative examples (for guidance only):
 - IF the voice assistant recognizes the command "Disable alarm" THEN Disable the home security system


 ## SUB-CATEGORY 2.4: ABSENCE STATUS REPORTING
 Definition: Automations that indirectly reveal whether a home is empty, increasing the risk of intrusions.
 Necessary rule conditions:
 - Actions that: turn lights on/off; modify Wi-Fi/alarms
 - The actions are related to presence at home
 Risk Conditions:
 - The rule allows us to deduce whether the house is empty.
 - The information is: observable from the outside or shared with third parties.
 Do not apply if:
 - The automation is not externally observable.
 - The information is not shared outside the household.
 - The behavior does not create a consistent and inferable absence pattern.
 - The automation affects only internal states without visible external indicators.
 Illustrative examples (for guidance only):
 - IF someone is home, THEN turn the light
 - If the last family member leaves home, then turn off lights
 - IF I leave home, THEN turn off the WiFi
 - If I'm the last person in the house and leave, send a notification to my smartwatch if any window in the house is open


 # CATEGORY 3: CYBERSECURITY HARM
 Description: This category includes automations that introduce malware exposure, data stream manipulation, or network communication abuse.
 Sub-categories: SUB-CATEGORY 3.1: MALICIOUS TRAFFIC GENERATION, SUB-CATEGORY 3.2: AUTOMATIC FILE SPREAD, SUB-CATEGORY 3.3: NETWORK COMMUNICATION THREATS

 ## SUB-CATEGORY 3.1: MALICIOUS TRAFFIC GENERATION
 Definition: Automations that can be exploited to generate excessive traffic, false alarms, or denial of service
 Rule conditions: The event is easily repeatable.
 Risk conditions:
 - The event can generate: excessive traffic / false alarms
 - The event is manipulable
 Do not apply if:
 - The event cannot be externally triggered or manipulated.
 - The notification is rate-limited or protected against repeated activation.
 - The action does not significantly increase network load or system exposure.
 Illustrative examples (for guidance only):
 - If the smart camera detects someone approaching, it automatically sends a text message to my phone

 ## SUB-CATEGORY 3.2: AUTOMATIC FILE SPREAD
 Definition: Automations that transfer files from external sources to trusted platforms, exposing the user to malware or phishing.
 Rule conditions:
 The automation involves automatic file download from:
 - external URLs
 - email attachments
 - messaging platforms
 - third-party APIs
 The automation stores, uploads, forwards, or makes the file available within:
 - trusted cloud storage
 - local systems
 - shared folders
 - collaboration platforms
 Risk conditions:
 - There is no content check (e.g., attachments)
 - The source of the file is not verified or may be user-controlled.
 - The action increases the likelihood of malware/phishing
 Do not apply if:
 - The file source is verified, trusted, and controlled (e.g., official governmental APIs).
 - The automation includes content validation or malware scanning before storage.
 - The user manually confirms the download before execution.
 - No file propagation to additional platforms occurs.
 Illustrative examples (for guidance only):
 - Add file from URL action from the Dropbox channel when the “Any new attachment in inbox ” trigger from the Gmail channel is activated

 ## SUB-CATEGORY 3.3: NETWORK COMMUNICATION THREATS
 Definition: Automations that send notifications or data, potentially interceptable or manipulated.
 Rule conditions:
 - The automation sends data or notifications over: SMS, messaging platforms, email
 - The transmitted information relates to security-relevant events, such as absence of occupants, alarm status, door/window state.
 Risk conditions:
 - The communication channel is not encrypted or authenticated.
 - Messages can be intercepted, spoofed, or altered in transit.
 Do not apply if:
 - The communication is encrypted and authenticated.
 - The communication does not expose the system to interception or spoofing risks.
 - The transmitted data does not expose occupancy, alarm status, or access control states.
 Illustrative examples (for guidance only):
 - If the smart camera detects someone approaching, it automatically sends a text message to my phone

 # CATEGORY 4: HARMLESS
 Definition: automations that do not present safety problems.
 Conditions:
 - The rule does not involve personal data
 - The rule does not modify the physical environment
 - The rule does not introduce risky network communications
 - The rule already includes device/user/presence checks
 Illustrative examples (for guidance only):
 - If it rains tomorrow, then remind me to bring an umbrella
"""

problem_type_guide = """
* PROBLEM TYPE (choose exactly one) *:

# RULE_SPECIFIC (S): the automation directly leads to a potentially dangerous situation.
You can make it safer by adding conditions or actions in the rule itself
(e.g., verifying presence at home, identity check, confirmation step).
Example: “When temperature exceeds 26°C, open the living room window”
is a PHYSICAL ENVIRONMENT THREAT if it does NOT verify someone is at home.

# GENERIC (G): the automation is not inherently dangerous; risk depends on configuration
or contextual factors. The best mitigation is a user behavior recommendation rather
than changing the rule logic.
Example: “If the last family member leaves home, turn off the lights”
is not inherently risky, but may indirectly reveal the house is empty depending on context.
"""

gravity_guide = """
* GRAVITY / SEVERITY (choose exactly one) *:

 # HIGH: direct and immediate security/privacy consequence.
 Examples: automatically opening doors; public photos without consent; malware propagation.

 # MEDIUM: indirect consequence or conditioned on other variables.
 Examples: absence deducible from light patterns; opening door via Bluetooth/device proximity.

 # LOW: minimal risk, marginal information leakage, or easily mitigable.
 Examples: notifications that might hint the user is away only if intercepted;
 downloads from relatively trusted sources with limited exposure.

 # NONE: no security/privacy consequence (comfort rules).
 Examples: lights/temperature/irrigation/morning routine.
"""

OUTPUT_SCHEMA = """
Return ONLY this JSON:

{
  "automation": "string",
  "category": "PERSONAL PRIVACY VIOLATION | PHYSICAL ENVIRONMENT THREAT | CYBERSECURITY HARM | HARMLESS",
  "subcategory": "one of the defined subcategories for that category, or empty string",
  "problem_type": "RULE_SPECIFIC | GENERIC | none",
  "gravity": "LOW | MEDIUM | HIGH | NONE",
  "scores": {
    "PERSONAL PRIVACY VIOLATION": 0.0,
    "PHYSICAL ENVIRONMENT THREAT": 0.0,
    "CYBERSECURITY HARM": 0.0,
    "HARMLESS": 0.0
  },
  "needs_human_review": true,
  "short_rationale": "max 2 sentences"
}
"""


def build_examples_text(retrieved_df, distance_band_fn, max_chars=600):
    """Render the retrieved neighbours as numbered few-shot examples.

    Args:
        retrieved_df: DataFrame of retrieved labeled automations; must carry a
            "distance" column, plus the label columns (automation, description,
            category, subcategory, problem_type, gravity).
        distance_band_fn: callable mapping a distance to a similarity band name.
        max_chars: truncation limit for the automation text.

    Returns:
        One text block per row, joined by newlines, for inclusion in the prompt.
    """
    parts = []
    for i, (_, r) in enumerate(retrieved_df.iterrows(), start=1):
        d = float(r["distance"])
        parts.append(
            f"""Example {i}:
 Automation: {str(r.get('automation', ''))[:max_chars]}
 Description: {str(r.get('description', ''))[:200]}
 Category: {r.get('category', '')}
 Subcategory: {r.get('subcategory', '')}
 Problem type: {r.get('problem_type', '')}
 Gravity: {r.get('gravity', '')}
 Distance: {d}
 Similarity level: {distance_band_fn(d)}
 """
        )
    return "\n".join(parts)


def build_prompt_local(query_text, retrieved_df, distance_band_fn):
    """Assemble the full classification prompt for one automation.

    Combines the task instructions, the taxonomy, the problem-type and gravity
    guides, the automation text, the top-1 retrieval distance/band, the
    rendered retrieved examples and the required JSON output schema.
    """
    top1_dist = float(retrieved_df["distance"].iloc[0])
    band = distance_band_fn(top1_dist)
    examples_text = build_examples_text(retrieved_df, distance_band_fn)

    return f"""{task}

{taxonomy}
{problem_type_guide}
{gravity_guide}

AUTOMATION TO LABEL:
{query_text}

TOP1_DISTANCE: {top1_dist}
SIMILARITY_BAND: {band}

RETRIEVED SIMILAR LABELED EXAMPLES (top-k):
{examples_text}

{OUTPUT_SCHEMA}
"""