Upload files to "/"

2026-03-06 11:31:07 +01:00 · 2026-03-06 11:31:07 +01:00 · 3a1060e4b2
parent 9c8050d48e
commit 3a1060e4b2
1 changed file with 108 additions and 176 deletions
--- a/annotation.py
+++ b/annotation.py
@ -28,15 +28,15 @@ client = AzureOpenAI(
 )

 # ----- Step 1: caricare datasets -----
-df_labeled = pd.read_csv("main/datasets/annotated_dataset.csv", encoding="cp1252", sep=';')  # colonne: automation, description, category, subcategory, problem_type, gravity
-df_unlabeled = pd.read_csv("main/datasets/unlabeled_dataset.csv", sep='\t', encoding='utf-8')
+df_labeled = pd.read_csv("main/datasets/annotated_dataset.csv", encoding="cp1252", sep=";")
+df_unlabeled = pd.read_csv("main/datasets/unlabeled_dataset.csv", sep="\t", encoding="utf-8")
 print("***STEP 1***\nDataset etichettato caricato. Numero righe:", len(df_labeled), "\nDataset non etichettato caricato. Numero righe:", len(df_unlabeled))

 def clean_id(x):
    if pd.isna(x):
        return ""
    s = str(x)
-    m = re.search(r"\d+", s)  # prima sequenza di cifre
+    m = re.search(r"\d+", s)
    return m.group(0) if m else s.strip()

 df_labeled["automation_id"] = df_labeled["automation_id"].apply(clean_id)
@ -44,15 +44,14 @@ df_unlabeled["automation_id"] = df_unlabeled["automation_id"].apply(clean_id)
 df_labeled["folder"] = df_labeled["folder"].astype(str).str.strip()
 df_unlabeled["folder"] = df_unlabeled["folder"].astype(str).str.strip()

-labeled_pairs = set(zip(df_labeled["automation_id"], df_labeled["folder"])) # Crea set di coppie già etichettate
-
+labeled_pairs = set(zip(df_labeled["automation_id"], df_labeled["folder"]))
 df_unlabeled_filtered = df_unlabeled[
-    ~df_unlabeled.apply(lambda row: (row["automation_id"], row["folder"]) in labeled_pairs, axis=1) # Filtra il dataset non etichettato
+    ~df_unlabeled.apply(lambda row: (row["automation_id"], row["folder"]) in labeled_pairs, axis=1)
 ]
 print("Automazioni non etichettate rimanenti dopo la pulizia:", len(df_unlabeled_filtered))


-# --- Step 2: embeddings ---
+# ----- Step 2: embeddings -----
 # Silenzia warning generici
 warnings.filterwarnings("ignore")
 # Silenzia logging di transformers / sentence-transformers / HF hub
@ -60,116 +59,112 @@ logging.getLogger("sentence_transformers").setLevel(logging.ERROR)
 logging.getLogger("transformers").setLevel(logging.ERROR)
 logging.getLogger("huggingface_hub").setLevel(logging.ERROR)

-print("\n***Step 2 ***\nEmbeddings")
+print("\n***Step 2***\nEmbeddings")
 model = SentenceTransformer("all-MiniLM-L6-v2")

 with open("main/labeled_embeddings.pkl", "rb") as f:
    data = pickle.load(f)

-embeddings = data['embeddings'].astype("float32")
+embeddings = data["embeddings"].astype("float32")
 print("Shape embeddings:", embeddings.shape)


+
 # ----- Step3: Creazione indice FAISS ---
-dimension = embeddings.shape[1]  
-index = faiss.IndexFlatL2(dimension)  # indice L2 (distanza Euclidea)
+faiss.normalize_L2(embeddings)
+dimension = embeddings.shape[1]
+index = faiss.IndexFlatIP(dimension)
 index.add(embeddings)
-print(f"\n***Step 3: Indice FAISS creato***. \nNumero di vettori nell'indice: {index.ntotal}")
+print(f"\n***Step 3: Indice FAISS creato***.\nNumero di vettori nell'indice: {index.ntotal}")


-# ----- Step4: Retrieval (similarità) ---
-# Prova con le prime 500 automazioni non annotate
+# ----- Step 4: Retrieval (similarità cosine) -----
 k = 5
 output_rows = []
-df_sample = df_unlabeled_filtered.head(500)
+df_sample = df_unlabeled_filtered.head(50).reset_index(drop=True)
 llm_rows = []

-def sim_label(distance: float) -> str:
-    if distance <= 0.50:
+
+def sim_label(sim: float) -> str:
+    # Higher score = more similar; the thresholds below map the score to a label.
+    if sim >= 0.80:
        return "Match forte"
-    elif distance <= 0.75:
+    elif sim >= 0.60:
        return "Match plausibile"
-    elif distance <= 0.90:
+    elif sim >= 0.50:
        return "Similarità instabile"
    else:
-        return "Troppo distante"
+        return "Debole"

-for i, row in df_sample.iterrows():
+for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
    query_text = str(row["human_like"])
-    print("numero corrente:", i)
+    print("automazione analizzata:", count)

    # Calcolo embedding della nuova automazione
    query_emb = model.encode([query_text], convert_to_numpy=True).astype("float32")
+    faiss.normalize_L2(query_emb)
+
    # Recupera indici dei k vicini più prossimi
-    distances, indices = index.search(query_emb, k)
+    sims, indices = index.search(query_emb, k)

    # Metriche globali sui top-k (una volta per automazione)
    topk_cats = []
-    top1_distance = float(distances[0][0])
-    top1_confidence = 1 / (1 + top1_distance)
-    top1_similarity_label = sim_label(top1_distance)
+    top1_sim = float(sims[0][0])
+    top1_similarity_label = sim_label(top1_sim)

    for rank in range(k):
        idx = int(indices[0][rank])
-        distance = float(distances[0][rank])
-        confidence = 1 / (1 + distance)
-        label = sim_label(distance)
+        sim = float(sims[0][rank])

        retrieved_row = df_labeled.iloc[idx]
-        topk_cats.append(str(retrieved_row["category"]))
+        topk_cats.append(str(retrieved_row.get("category", "")))

    rank1_category = topk_cats[0] if topk_cats else ""
    majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
    consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0
+    print(consistency)

    for rank in range(k):
        idx = int(indices[0][rank])
-        distance = float(distances[0][rank])
-        confidence = 1 / (1 + distance)
-        label = sim_label(distance)
+        sim = float(sims[0][rank])
+        label = sim_label(sim)

        retrieved_row = df_labeled.iloc[idx]

        output_rows.append({
-            # query
            "automazione da etichettare": query_text,
-
            # info retrieval per questa riga
            "rank": rank + 1,
            "retrieved_idx": idx,
-            "automazione simile": retrieved_row["automation"],
-            "categoria automazione simile": retrieved_row["category"],
-            "distanza": distance,
-            "confidence": round(confidence, 4),
-            "similarity": label,
-
+            "automazione simile": retrieved_row.get("automation", ""),
+            "categoria automazione simile": retrieved_row.get("category", ""),
+            "similarita_cosine": sim,
+            "similarity_label": label,
            # metriche aggregate top-k (ripetute su ogni riga)
-            "rank1_distance": top1_distance,
-            "rank1_confidence": round(top1_confidence, 4),
+            "rank1_similarity": top1_sim,
            "rank1_similarity_label": top1_similarity_label,
            "rank1_category": rank1_category,
            "majority_category": majority_category,
            "consistency": round(consistency, 3),
-            "top5_categories": " | ".join(topk_cats)
+            "top5_categories": " | ".join(topk_cats),
        })
-    

-    # --- Step5: invio dati al LLM ---
+
+    # ----- Step 5: invio dati al LLM -----
    # (1) Costruzione prompt
    retrieved = df_labeled.iloc[indices[0]].copy()
-    retrieved["distance"] = distances[0].astype(float)
-    retrieved["confidence"] = retrieved["distance"].apply(lambda d: 1 / (1 + float(d)))
-    retrieved["similarity"] = retrieved["distance"].apply(sim_label)
+    retrieved["similarity"] = sims[0].astype(float)
+    retrieved["similarity_label"] = retrieved["similarity"].apply(sim_label)
    prompt = build_prompt_local(query_text, retrieved, sim_label)

    # (2) Chiamata al modello: restituisce JSON
    resp = client.chat.completions.create(
        model=deployment,
        messages=[
-            {"role": "system", "content": "Return ONLY valid JSON. No extra text."},
-            {"role": "user", "content": prompt},
+           {"role": "system", "content": prompt},
+           {"role": "user", "content": f'automation to evaluate: {query_text}'}
        ],
-        temperature=0
+        temperature=0,
    )
    content = resp.choices[0].message.content.strip()

@ -185,43 +180,51 @@ for i, row in df_sample.iterrows():
            "gravity": "",
            "scores": {},
            "needs_human_review": True,
-            "short_rationale": f"JSON_PARSE_ERROR: {content[:200]}"
+            "short_rationale": f"JSON_PARSE_ERROR: {content[:200]}",
        }

    # (4) Salvataggio di 1 riga per automazione con:
    # - metriche retrieval (rank1/majority/consistency)
    # - output dell'LLM (scores + label finale + review flag)
-    llm_category = parsed.get("category", "")
-    llm_subcategory = parsed.get("subcategory", "")
-    llm_problem_type = parsed.get("problem_type", "")
-    llm_gravity = parsed.get("gravity", "")
+    llm_category = str(parsed.get("category") or "").strip()  # JSON null -> "", not the literal "None"
+    llm_subcategory = str(parsed.get("subcategory") or "").strip()
+    llm_problem_type = str(parsed.get("problem_type") or "").strip()
+    llm_gravity = str(parsed.get("gravity") or "").strip()
+    if llm_category.upper() == "HARMLESS":
+        llm_subcategory = ""
+        llm_problem_type = "none"
+        llm_gravity = "NONE"
+    # By default the final labels are the (normalised) LLM labels; rows flagged for review are revised by a human.
    final_category = llm_category
    final_subcategory = llm_subcategory
    final_problem_type = llm_problem_type
    final_gravity = llm_gravity
-    if llm_category.strip().upper() == "HARMLESS":
-        llm_subcategory = ""
-        llm_problem_type = "NONE"
-        llm_gravity = "NONE"

-    # ================= HUMAN REVIEW LOGIC =================
-    needs_human_review = bool(parsed.get("needs_human_review", True))
-    OVERRIDE_MAX_DISTANCE = 0.90
-    OVERRIDE_MIN_CONSISTENCY = 0.60
-    # Allineamento forte: LLM = majority = top1
-    aligned_strong = (
-        llm_category == majority_category and
-        llm_category == rank1_category and
-        llm_category != ""
-    )
-    # distanza non eccessiva e buona consistency 
-    good_retrieval = (
-        top1_distance <= OVERRIDE_MAX_DISTANCE and
-        consistency >= OVERRIDE_MIN_CONSISTENCY
-    )
-    # allora NON richiede revisione anche se il modello aveva messo True
-    if aligned_strong and good_retrieval:
+    
+    # Review when retrieval is weak or the LLM gave no usable category
+    # (presumably the JSON_PARSE_ERROR fallback); an empty label must not pass.
+    needs_human_review = (
+        not llm_category
+        or top1_similarity_label in ("Debole", "Similarità instabile")
+    )
+    final_needs_human_review = needs_human_review
+    # ================= HUMAN REVIEW LOGIC =================
+    aligned_strong = (
+        llm_category == majority_category
+        and llm_category == rank1_category
+        and llm_category != ""
+    )
+
+    OVERRIDE_MIN_SIMILARITY = 0.38
+    OVERRIDE_MIN_CONSISTENCY = 0.60
+
+    good_retrieval = (
+        top1_sim >= OVERRIDE_MIN_SIMILARITY
+        and consistency >= OVERRIDE_MIN_CONSISTENCY
+    )
+    # Override: clear the flag when the LLM agrees with both rank-1 and majority and retrieval is good.
+    if aligned_strong and good_retrieval:
+        final_needs_human_review = False
    # =====================================================


@ -230,134 +233,63 @@ for i, row in df_sample.iterrows():
        "folder": row.get("folder", ""),
        "automation_text": query_text,

-        "rank1_distance": top1_distance,
-        "rank1_confidence": round(top1_confidence, 4),
+        # Retrieval metrics
+        "rank1_similarity": top1_sim,
        "rank1_similarity_label": top1_similarity_label,
        "rank1_category": rank1_category,
        "majority_category": majority_category,
        "consistency": round(consistency, 3),
        "top5_categories": " | ".join(topk_cats),
-        
+
+        # LLM
        "llm_category": llm_category,
        "llm_subcategory": llm_subcategory,
        "llm_problem_type": llm_problem_type,
        "llm_gravity": llm_gravity,
-        "llm_needs_human_review": parsed.get("needs_human_review", True),
-        "final_needs_human_review": needs_human_review,

+        "needs_human_review": needs_human_review,  # key must match the Excel colouring loop
+        "final_needs_human_review": final_needs_human_review,  # key read by the colouring loop and the summary count
+
+        # FINAL
        "final_category": final_category,
        "final_subcategory": final_subcategory,
        "final_problem_type": final_problem_type,
        "final_gravity": final_gravity,

-        "llm_rationale": parsed.get("short_rationale", "")
+        "llm_rationale": parsed.get("short_rationale", ""),
    })


-# --- Step6: integrazione e output ---
-# (5) Esportare l’output finale come dataframe
-df_llm = pd.DataFrame(llm_rows)
-out_path = "main/datasets/labeling_first500.xlsx"
-df_llm.to_excel(out_path, index=False)
+# ----- Step 6: output Excel -----
+df_out = pd.DataFrame(llm_rows)
+out_path = "main/datasets/labeling_first50.xlsx"
+df_out.to_excel(out_path, index=False)

 wb = load_workbook(out_path)
 ws = wb.active

-# Colori per needs_human_review
 true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid")   # rosso
 false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid") # verde
-col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
-if "llm_needs_human_review" in col_index:
-    c = col_index["llm_needs_human_review"]
-    for r in range(2, ws.max_row + 1):
-        val = ws.cell(row=r, column=c).value
-        if val is True:
-            ws.cell(row=r, column=c).fill = true_fill
-        elif val is False:
-            ws.cell(row=r, column=c).fill = false_fill

-if "final_needs_human_review" in col_index:
-    c = col_index["final_needs_human_review"]
-    for r in range(2, ws.max_row + 1):
-        val = ws.cell(row=r, column=c).value
-        if val is True:
-            ws.cell(row=r, column=c).fill = true_fill
-        elif val is False:
-            ws.cell(row=r, column=c).fill = false_fill
+col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
+
+for col_name in ["needs_human_review", "final_needs_human_review"]:
+    if col_name in col_index:
+        c = col_index[col_name]
+        for r in range(2, ws.max_row + 1):
+            val = ws.cell(row=r, column=c).value
+            if val is True:
+                ws.cell(row=r, column=c).fill = true_fill
+            elif val is False:
+                ws.cell(row=r, column=c).fill = false_fill

 wb.save(out_path)
-print(f"\n***Step 6: Retrieval e LLM ***\nExcel LLM salvato in {out_path}")
+print(f"\n***Step 6: Excel salvato in {out_path}")

 # --- Conteggio needs_human_review ---
-review_counts = df_llm["final_needs_human_review"].value_counts(dropna=False)
+review_counts = df_out["final_needs_human_review"].value_counts(dropna=False)
 true_count = review_counts.get(True, 0)
 false_count = review_counts.get(False, 0)
 print("\n--- Needs human review summary ---")
 print(f"needs_human_review = True : {true_count}")
-print(f"needs_human_review = False: {false_count}")
-
-
-# --- Step7: dataset finale su tutte le automazioni (solo testo + etichette) ---
-df_final = df_llm[[
-    "automation_text",
-    "llm_category",
-    "llm_subcategory",
-    "llm_gravity",
-    "llm_problem_type",
-    "final_needs_human_review"
-]].rename(columns={
-    "llm_category": "category",
-    "llm_subcategory": "subcategory",
-    "llm_gravity": "gravity",
-    "llm_problem_type": "problem_type"
-})
-
-# Normalizza stringhe
-for col in ["category", "subcategory", "gravity", "problem_type"]:
-    df_final[col] = df_final[col].fillna("").astype(str).str.strip()
-
-
-
-
-
-
-
-
-
-
-
-
-# Creazione DataFrame risultati
-# df_results = pd.DataFrame(output_rows)
-# output_path = "main/datasets/similarity_analysis.xlsx"
-# df_results.to_excel(output_path, index=False)
-
-#wb = load_workbook(output_path)
-#ws = wb.active
-
-#distanza_col_idx = None
-#for idx, cell in enumerate(ws[1], start=1):
-    #if cell.value == "distanza":
-        #distanza_col_idx = idx
-        #break
-#if distanza_col_idx is None:
-    #raise ValueError("Colonna 'distanza' non trovata!")
-
-# Applichiamo i colori in base al valore
-#for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=distanza_col_idx, max_col=distanza_col_idx):
-    #cell = row[0]
-    #try:
-        #val = float(cell.value)
-        #if val < 0.5:
-            #color = "90EE90"  # verde chiaro
-        #elif val < 1.0:
-            #color = "FFFF00"  # giallo
-        #else:
-            #color = "FF6347"  # rosso
-        #cell.fill = PatternFill(start_color=color, end_color=color, fill_type="solid")
-    #except:
-        #continue
-
-# Salva il file direttamente con colori applicati
-#wb.save(output_path)
-#print(f"Excel salvato in {output_path}")
+print(f"needs_human_review = False: {false_count}")