This commit is contained in:
Arianna Di Serio 2026-03-11 15:38:45 +01:00
parent f5c16bbf3d
commit 110d70fc58
2 changed files with 189 additions and 39 deletions

View File

@ -14,42 +14,64 @@ from collections import Counter
from prompts.prompt import build_prompt_local
import warnings
import logging
import unicodedata
# --- Configuration: Azure OpenAI client ---
# NOTE: the diff-merged text assigned `deployment` twice and passed
# `api_version` twice (a SyntaxError); keep only the new values.
endpoint = "https://gpt-sw-central-tap-security.openai.azure.com/"
deployment = "gpt-5.1-chat-3"
# SECURITY: API key hardcoded and committed to the repo — rotate this key and
# load it from an environment variable / secret store instead.
subscription_key = "8zufUIPs0Dijh0M6NpifkkDvxJHZMFtott7u8V8ySTYNcpYVoRbsJQQJ99BBACfhMk5XJ3w3AAABACOGr6sq"
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2025-04-01-preview",
)
# ----- Step 1: load datasets -----
# The old CSV reads were immediately overwritten by the Excel reads below
# (wasted I/O, and the CSV files may no longer exist) — removed.
df_labeled = pd.read_excel("main/datasets/annotated_dataset_updated.xlsx").dropna(how="all")
df_unlabeled = pd.read_excel("main/datasets/unlabeled_dataset.xlsx").dropna(how="all")
print("***STEP 1***\nDataset etichettato caricato. Numero righe:", len(df_labeled), "\nDataset non etichettato caricato. Numero righe:", len(df_unlabeled))
# Use a single "id" key name in both frames so (id, folder) pairs line up
df_labeled = df_labeled.rename(columns={"automation_id": "id"})
df_unlabeled = df_unlabeled.rename(columns={"automation_id": "id"})
# Column cleaning
def clean_id(x):
    """Normalize an automation id: trim whitespace and stray quotes, lowercase.

    NaN/None become "" so ids compare safely in set operations.
    The old regex digit-extraction body made the lines below it unreachable;
    the intended behavior (matching merge.py's clean_id) is kept.
    """
    if pd.isna(x):
        return ""
    s = str(x).strip()           # remove surrounding whitespace
    s = s.strip('"').strip("'")  # remove stray quotes
    return s.lower()
df_labeled["automation_id"] = df_labeled["automation_id"].apply(clean_id)
df_unlabeled["automation_id"] = df_unlabeled["automation_id"].apply(clean_id)
df_labeled["folder"] = df_labeled["folder"].astype(str).str.strip()
df_unlabeled["folder"] = df_unlabeled["folder"].astype(str).str.strip()
def clean_folder(x):
"""Pulizia folder: rimuove spazi multipli, normalizza unicode."""
if pd.isna(x):
return ""
s = str(x).strip().lower()
s = unicodedata.normalize("NFKC", s)
s = re.sub(r'\s+', ' ', s)
return s
labeled_pairs = set(zip(df_labeled["automation_id"], df_labeled["folder"]))
df_unlabeled_filtered = df_unlabeled[
~df_unlabeled.apply(lambda row: (row["automation_id"], row["folder"]) in labeled_pairs, axis=1)
]
print("Automazioni non etichettate rimanenti dopo la pulizia:", len(df_unlabeled_filtered))
for df in [df_labeled, df_unlabeled]:
df["id"] = df["id"].apply(clean_id)
df["folder"] = df["folder"].apply(clean_folder)
labeled_pairs = set(zip(df_labeled["id"], df_labeled["folder"]))
# crea maschera: True = la riga NON è presente in labeled
mask_unlabeled = ~df_unlabeled.apply(lambda r: (r["id"], r["folder"]) in labeled_pairs, axis=1)
# filtra
df_unlabeled_filtered = df_unlabeled[mask_unlabeled].copy()
print("Numero righe df_unlabeled dopo aver rimosso quelle già etichettate:", len(df_unlabeled_filtered))
unlabeled_pairs = set(zip(df_unlabeled["id"], df_unlabeled["folder"]))
missing_in_unlabeled = labeled_pairs - unlabeled_pairs
print("Numero coppie etichettate non presenti in unlabeled:", len(missing_in_unlabeled))
if missing_in_unlabeled:
print("Coppie mancanti:")
for p in list(missing_in_unlabeled)[:50]: # stampa solo le prime 50 per comodità
print(p)
# ----- Step 2: embeddings -----
# Silenzia warning generici
@ -62,6 +84,7 @@ logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
print("\n***Step 2***\nEmbeddings")
model = SentenceTransformer("all-MiniLM-L6-v2")
#with open("main/labeled_embeddings_71.pkl", "rb") as f:
with open("main/labeled_embeddings.pkl", "rb") as f:
data = pickle.load(f)
@ -70,7 +93,7 @@ print("Shape embeddings:", embeddings.shape)
# ----- Step3: Creazione indice FAISS e calcolo similarity ---
# ----- Step3: Creazione indice FAISS ---
faiss.normalize_L2(embeddings)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
@ -78,13 +101,12 @@ index.add(embeddings)
print(f"\n***Step 3: Indice FAISS creato***.\nNumero di vettori nell'indice: {index.ntotal}")
# ----- Step 4: Retrieval: 5 automazioni più simili -----
# ----- Step 4: Retrieval (similarità cosine) -----
k = 5
output_rows = []
df_sample = df_unlabeled_filtered.head(50).reset_index(drop=True)
df_sample = df_unlabeled_filtered.head(10).reset_index(drop=True)
llm_rows = []
# label based on the similarity score
def sim_label(sim: float) -> str:
# più alto = più simile
if sim >= 0.80:
@ -122,8 +144,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
rank1_category = topk_cats[0] if topk_cats else ""
majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0
# per ognuna delle 5 automazioni simili
for rank in range(k):
idx = int(indices[0][rank])
sim = float(sims[0][rank])
@ -164,7 +185,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
{"role": "system", "content": prompt},
{"role": "user", "content": f'automation to evaluate: {query_text}'}
],
temperature=0,
reasoning_effort= "low"
)
content = resp.choices[0].message.content.strip()
@ -185,7 +206,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
# (4) Salvataggio di 1 riga per automazione con:
# - metriche retrieval (rank1/majority/consistency)
# - output dell'LLM
# - output dell'LLM (scores + label finale + review flag)
llm_category = str(parsed.get("category", "")).strip()
llm_subcategory = str(parsed.get("subcategory", "")).strip()
llm_problem_type = str(parsed.get("problem_type", "")).strip()
@ -194,27 +215,28 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
llm_subcategory = ""
llm_problem_type = "none"
llm_gravity = "NONE"
# di default l'etichetta finale assegnata è quella del LLM - revisionata se review=true
# di default l'etichetta assegnata è quella del LLM - rivista se review=true
final_category = llm_category
final_subcategory = llm_subcategory
final_problem_type = llm_problem_type
final_gravity = llm_gravity
# ================= REVIEW LOGIC =================
if top1_similarity_label == "Debole" or top1_similarity_label == "Similarità instabile":
needs_human_review = True
needs_review = True
else:
needs_human_review = False
needs_review = False
final_needs_human_review = needs_human_review
final_needs_review = needs_review
# ================= HUMAN REVIEW LOGIC =================
aligned_strong = (
llm_category == majority_category
and llm_category == rank1_category
and llm_category != ""
)
OVERRIDE_MIN_SIMILARITY = 0.38
OVERRIDE_MIN_SIMILARITY = 0.39
OVERRIDE_MIN_CONSISTENCY = 0.60
good_retrieval = (
@ -223,12 +245,12 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
)
if aligned_strong and good_retrieval:
final_needs_human_review = False
final_needs_review = False
# =====================================================
llm_rows.append({
"id": row.get("automation_id", ""),
"id": row.get("id", ""),
"folder": row.get("folder", ""),
"automation_text": query_text,
@ -246,8 +268,8 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
"llm_problem_type": llm_problem_type,
"llm_gravity": llm_gravity,
"needs_review": needs_human_review,
"final_needs_review": final_needs_human_review,
"needs_review": needs_review,
"final_needs_review": final_needs_review,
# FINAL
"final_category": final_category,
@ -261,16 +283,17 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
# ----- Step 6: output Excel -----
df_out = pd.DataFrame(llm_rows)
out_path = "main/datasets/labeling_first50.xlsx"
out_path = "main/datasets/labeling_2_500.xlsx"
df_out.to_excel(out_path, index=False)
wb = load_workbook(out_path)
ws = wb.active
# colore delle colonne review
true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid") # rosso
false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid") # verde
col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
for col_name in ["needs_review", "final_needs_review"]:
if col_name in col_index:
c = col_index[col_name]
@ -284,7 +307,7 @@ for col_name in ["needs_review", "final_needs_review"]:
wb.save(out_path)
print(f"\n***Step 6: Excel salvato in {out_path}")
# --- Conteggio final_needs_review ---
# --- Conteggio needs_human_review ---
review_counts = df_out["final_needs_review"].value_counts(dropna=False)
true_count = review_counts.get(True, 0)
false_count = review_counts.get(False, 0)

127
merge.py Normal file
View File

@ -0,0 +1,127 @@
# --- Import librerie ---
import pandas as pd
import numpy as np
import unicodedata
import re
import warnings
from sentence_transformers import SentenceTransformer
import pickle
# ----- Percorsi file -----
LABELED_IN = "main/datasets/annotated_dataset.xlsx"
REVIEWED = "main/datasets/first500_reviewed.xlsx"
LABELED_OUT = "main/datasets/annotated_dataset_updated.xlsx"
# ----- Funzioni di pulizia -----
def clean(x):
    """Return *x* as a whitespace-stripped string; NaN/None become ""."""
    return "" if pd.isna(x) else str(x).strip()
def normalize_problem_type(x):
    """Expand shorthand problem-type codes (S/G) to their full names.

    Any other value is returned uppercased; NaN/None become "".
    """
    code = ("" if pd.isna(x) else str(x).strip()).upper()
    expansions = {"S": "RULE_SPECIFIC", "G": "GENERIC"}
    return expansions.get(code, code)
def normalize_severity(x):
    """Uppercase a severity value; NaN/None become ""."""
    return ("" if pd.isna(x) else str(x).strip()).upper()
def clean_id(x):
    """Normalize an id: trim whitespace and stray quotes, then lowercase."""
    if pd.isna(x):
        return ""
    text = str(x).strip()
    # strip double quotes first, then single quotes (order preserved)
    for quote in ('"', "'"):
        text = text.strip(quote)
    return text.lower()
def clean_folder(x):
    """Normalize a folder name: lowercase, NFKC-normalize, collapse whitespace runs."""
    if pd.isna(x):
        return ""
    folder = unicodedata.normalize("NFKC", str(x).strip().lower())
    return re.sub(r"\s+", " ", folder)
# ----- Step 1: load datasets -----
df_labeled = pd.read_excel(LABELED_IN)
# Drop Excel artifact columns ("Unnamed: N") left over from saved indexes
df_labeled = df_labeled.loc[:, ~df_labeled.columns.str.contains("^Unnamed")].copy()
df_labeled = df_labeled.dropna(how="all")
df_labeled = df_labeled.rename(columns={"automation_id": "id"})
df_rev = pd.read_excel(REVIEWED)
# Normalize problem_type shorthand codes (S/G) in the existing labeled set
if "error_type" in df_labeled.columns:
    df_labeled["error_type"] = df_labeled["error_type"].apply(normalize_problem_type)
# Build a clean dataset from the reviewed rows
# NOTE(review): assumes REVIEWED has the final_* / automation_text / borderline
# columns produced by the labeling pipeline — verify against its writer.
rows = []
for _, r in df_rev.iterrows():
    category = clean(r["final_category"])
    subcategory = clean(r["final_subcategory"])
    error_type = normalize_problem_type(r["final_problem_type"])
    severity = normalize_severity(r["final_gravity"])
    # HARMLESS consistency: harmless automations carry no sub-labels
    if category.upper() == "HARMLESS":
        subcategory = ""
        error_type = "none"
        severity = "none"
    rows.append({
        "id": clean(r["id"]),
        "folder": clean(r["folder"]),
        "automation": clean(r["automation_text"]),
        "description": clean(r.get("llm_rationale", "")),
        "category": category,
        "subcategory": subcategory,
        "error_type": error_type,
        "severity": severity,
        "borderline": clean(r["borderline"]),
    })
df_new = pd.DataFrame(rows)
# Canonicalize 'none' values to lowercase; other values keep their case
df_new["error_type"] = df_new["error_type"].apply(lambda x: x.lower() if x.lower() == "none" else x)
df_new["severity"] = df_new["severity"].apply(lambda x: x.lower() if x.lower() == "none" else x)
# Drop rows with no category
df_new = df_new[df_new["category"] != ""].copy()
# Clean id and folder in both datasets so the (id, folder) join keys match
for df in [df_labeled, df_new]:
    df["id"] = df["id"].apply(clean_id)
    df["folder"] = df["folder"].apply(clean_folder)
# De-duplicate: remove from labeled any (id, folder) pair already present in df_new
new_keys = set(zip(df_new["id"], df_new["folder"]))
df_labeled_clean = df_labeled[~df_labeled.apply(lambda r: (r["id"], r["folder"]) in new_keys, axis=1)].copy()
# Final concat
df_final = pd.concat([df_labeled_clean, df_new], ignore_index=True).fillna("")
# Save updated dataset
df_final.to_excel(LABELED_OUT, index=False)
print("✅ Merge completato")
print("Righe iniziali:", len(df_labeled))
# NOTE: counts all reviewed rows, including any later dropped for an empty category
print("Righe aggiunte:", len(df_rev))
print("Totale finale:", len(df_final))
# ----- Step 2: recompute embeddings on the merged dataset -----
warnings.filterwarnings("ignore")  # silence transformer/tokenizer warnings
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = df_final["automation"].tolist()
# unit-norm float32 vectors (normalize_embeddings=True)
embeddings = model.encode(
    texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True
).astype("float32")
print("Shape embeddings ricalcolati:", embeddings.shape)
# ----- Step 3: persist embeddings together with their row ids -----
with open("main/labeled_embeddings.pkl", "wb") as f:
    pickle.dump({"embeddings": embeddings, "id": df_final["id"].tolist()}, f)
print("Embeddings salvati con successo!")