update
This commit is contained in:
parent
f5c16bbf3d
commit
110d70fc58
101
annotation.py
101
annotation.py
|
|
@ -14,42 +14,64 @@ from collections import Counter
|
|||
from prompts.prompt import build_prompt_local
|
||||
import warnings
|
||||
import logging
|
||||
|
||||
import unicodedata
|
||||
|
||||
# --- Configuration ---
import os

endpoint = "https://gpt-sw-central-tap-security.openai.azure.com/"
deployment = "gpt-5.1-chat-3"

# SECURITY: the API key was previously committed here as a string literal.
# A key that has been in version control must be considered leaked and
# rotated; the script now only accepts it from the environment.
subscription_key = os.environ.get("AZURE_OPENAI_API_KEY", "")
if not subscription_key:
    raise RuntimeError(
        "AZURE_OPENAI_API_KEY is not set; export the Azure OpenAI key "
        "before running this script."
    )

# Single client shared by every chat-completion call made further down.
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2025-04-01-preview",
)
|
||||
|
||||
# ----- Step 1: load datasets -----
# Both datasets are read from Excel; rows that are entirely empty are dropped.
# NOTE(review): "annotated_dataset_updated.xlsx" is presumably the merged file
# written by the companion merge script — confirm it has been generated first.
df_labeled = pd.read_excel("main/datasets/annotated_dataset_updated.xlsx").dropna(how="all")
df_unlabeled = pd.read_excel("main/datasets/unlabeled_dataset.xlsx").dropna(how="all")
print("***STEP 1***\nDataset etichettato caricato. Numero righe:", len(df_labeled), "\nDataset non etichettato caricato. Numero righe:", len(df_unlabeled))

# Use a single key column name ("id") in both frames.
df_labeled = df_labeled.rename(columns={"automation_id": "id"})
df_unlabeled = df_unlabeled.rename(columns={"automation_id": "id"})
|
||||
|
||||
# Pulizia colonne
|
||||
def clean_id(x):
    """Normalize an automation id so that ids from different sheets compare equal.

    Args:
        x: Raw cell value (string, number or a missing value).

    Returns:
        ``""`` when ``pd.isna(x)`` is true; otherwise ``str(x)`` with the
        surrounding whitespace and surrounding single/double quotes removed,
        lower-cased.
    """
    if pd.isna(x):
        return ""
    # Trim whitespace, then quotes in any nesting order, then the whitespace
    # that was enclosed by the quotes (e.g. '" abc "').
    s = str(x).strip().strip("\"'").strip()
    return s.lower()
|
||||
|
||||
def clean_folder(x):
    """Normalize a folder name for use as a comparison key.

    Args:
        x: Raw cell value (string or a missing value).

    Returns:
        ``""`` when ``pd.isna(x)`` is true; otherwise the NFKC-normalized,
        lower-cased text with every whitespace run collapsed to one space and
        no leading/trailing whitespace.
    """
    if pd.isna(x):
        return ""
    # Normalize first: NFKC can itself yield capitals and spaces, so the
    # lower-casing and trimming have to happen afterwards.
    s = unicodedata.normalize("NFKC", str(x)).lower()
    return re.sub(r"\s+", " ", s).strip()


# Normalize the (id, folder) key columns of both datasets in place. The key
# column is "id" here because "automation_id" was renamed right after loading.
for df in [df_labeled, df_unlabeled]:
    df["id"] = df["id"].apply(clean_id)
    df["folder"] = df["folder"].apply(clean_folder)

labeled_pairs = set(zip(df_labeled["id"], df_labeled["folder"]))

# Mask is True for rows whose (id, folder) pair is NOT already labelled.
mask_unlabeled = ~df_unlabeled.apply(lambda r: (r["id"], r["folder"]) in labeled_pairs, axis=1)
df_unlabeled_filtered = df_unlabeled[mask_unlabeled].copy()

print("Numero righe df_unlabeled dopo aver rimosso quelle già etichettate:", len(df_unlabeled_filtered))

# Sanity check: labelled pairs that never occur in the unlabeled dataset.
unlabeled_pairs = set(zip(df_unlabeled["id"], df_unlabeled["folder"]))
missing_in_unlabeled = labeled_pairs - unlabeled_pairs
print("Numero coppie etichettate non presenti in unlabeled:", len(missing_in_unlabeled))
if missing_in_unlabeled:
    print("Coppie mancanti:")
    for p in list(missing_in_unlabeled)[:50]:  # print only the first 50
        print(p)
|
||||
|
||||
# ----- Step 2: embeddings -----
|
||||
# Silenzia warning generici
|
||||
|
|
@ -62,6 +84,7 @@ logging.getLogger("huggingface_hub").setLevel(logging.ERROR)
|
|||
print("\n***Step 2***\nEmbeddings")
|
||||
model = SentenceTransformer("all-MiniLM-L6-v2")
|
||||
|
||||
#with open("main/labeled_embeddings_71.pkl", "rb") as f:
|
||||
with open("main/labeled_embeddings.pkl", "rb") as f:
|
||||
data = pickle.load(f)
|
||||
|
||||
|
|
@ -70,7 +93,7 @@ print("Shape embeddings:", embeddings.shape)
|
|||
|
||||
|
||||
|
||||
# ----- Step3: Creazione indice FAISS e calcolo similarity ---
|
||||
# ----- Step3: Creazione indice FAISS ---
|
||||
faiss.normalize_L2(embeddings)
|
||||
dimension = embeddings.shape[1]
|
||||
index = faiss.IndexFlatIP(dimension)
|
||||
|
|
@ -78,13 +101,12 @@ index.add(embeddings)
|
|||
print(f"\n***Step 3: Indice FAISS creato***.\nNumero di vettori nell'indice: {index.ntotal}")
|
||||
|
||||
|
||||
# ----- Step 4: Retrieval: 5 automazioni più simili -----
|
||||
# ----- Step 4: Retrieval (similarità cosine) -----
|
||||
k = 5
|
||||
output_rows = []
|
||||
df_sample = df_unlabeled_filtered.head(50).reset_index(drop=True)
|
||||
df_sample = df_unlabeled_filtered.head(10).reset_index(drop=True)
|
||||
llm_rows = []
|
||||
|
||||
# Label derived from the similarity score
|
||||
def sim_label(sim: float) -> str:
|
||||
# più alto = più simile
|
||||
if sim >= 0.80:
|
||||
|
|
@ -122,8 +144,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
|||
rank1_category = topk_cats[0] if topk_cats else ""
|
||||
majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
|
||||
consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0
|
||||
|
||||
# per ognuna delle 5 automazioni simili
|
||||
|
||||
for rank in range(k):
|
||||
idx = int(indices[0][rank])
|
||||
sim = float(sims[0][rank])
|
||||
|
|
@ -164,7 +185,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
|||
{"role": "system", "content": prompt},
|
||||
{"role": "user", "content": f'automation to evaluate: {query_text}'}
|
||||
],
|
||||
temperature=0,
|
||||
reasoning_effort= "low"
|
||||
)
|
||||
content = resp.choices[0].message.content.strip()
|
||||
|
||||
|
|
@ -185,7 +206,7 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
|||
|
||||
# (4) Salvataggio di 1 riga per automazione con:
|
||||
# - metriche retrieval (rank1/majority/consistency)
|
||||
# - output dell'LLM
|
||||
# - output dell'LLM (scores + label finale + review flag)
|
||||
llm_category = str(parsed.get("category", "")).strip()
|
||||
llm_subcategory = str(parsed.get("subcategory", "")).strip()
|
||||
llm_problem_type = str(parsed.get("problem_type", "")).strip()
|
||||
|
|
@ -194,27 +215,28 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
|||
llm_subcategory = ""
|
||||
llm_problem_type = "none"
|
||||
llm_gravity = "NONE"
|
||||
# di default l'etichetta finale assegnata è quella del LLM - revisionata se review=true
|
||||
# di default l'etichetta assegnata è quella del LLM - rivista se review=true
|
||||
final_category = llm_category
|
||||
final_subcategory = llm_subcategory
|
||||
final_problem_type = llm_problem_type
|
||||
final_gravity = llm_gravity
|
||||
|
||||
|
||||
# ================= REVIEW LOGIC =================
|
||||
|
||||
if top1_similarity_label == "Debole" or top1_similarity_label == "Similarità instabile":
|
||||
needs_human_review = True
|
||||
needs_review = True
|
||||
else:
|
||||
needs_human_review = False
|
||||
needs_review = False
|
||||
|
||||
final_needs_human_review = needs_human_review
|
||||
|
||||
final_needs_review = needs_review
|
||||
# ================= HUMAN REVIEW LOGIC =================
|
||||
aligned_strong = (
|
||||
llm_category == majority_category
|
||||
and llm_category == rank1_category
|
||||
and llm_category != ""
|
||||
)
|
||||
|
||||
OVERRIDE_MIN_SIMILARITY = 0.38
|
||||
OVERRIDE_MIN_SIMILARITY = 0.39
|
||||
OVERRIDE_MIN_CONSISTENCY = 0.60
|
||||
|
||||
good_retrieval = (
|
||||
|
|
@ -223,12 +245,12 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
|||
)
|
||||
|
||||
if aligned_strong and good_retrieval:
|
||||
final_needs_human_review = False
|
||||
final_needs_review = False
|
||||
# =====================================================
|
||||
|
||||
|
||||
llm_rows.append({
|
||||
"id": row.get("automation_id", ""),
|
||||
"id": row.get("id", ""),
|
||||
"folder": row.get("folder", ""),
|
||||
"automation_text": query_text,
|
||||
|
||||
|
|
@ -246,8 +268,8 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
|||
"llm_problem_type": llm_problem_type,
|
||||
"llm_gravity": llm_gravity,
|
||||
|
||||
"needs_review": needs_human_review,
|
||||
"final_needs_review": final_needs_human_review,
|
||||
"needs_review": needs_review,
|
||||
"final_needs_review": final_needs_review,
|
||||
|
||||
# FINAL
|
||||
"final_category": final_category,
|
||||
|
|
@ -261,16 +283,17 @@ for count, (_, row) in enumerate(df_sample.iterrows(), start=1):
|
|||
|
||||
# ----- Step 6: output Excel -----
|
||||
df_out = pd.DataFrame(llm_rows)
|
||||
out_path = "main/datasets/labeling_first50.xlsx"
|
||||
out_path = "main/datasets/labeling_2_500.xlsx"
|
||||
df_out.to_excel(out_path, index=False)
|
||||
|
||||
wb = load_workbook(out_path)
|
||||
ws = wb.active
|
||||
|
||||
# colore delle colonne review
|
||||
true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid") # rosso
|
||||
false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid") # verde
|
||||
|
||||
col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}
|
||||
|
||||
for col_name in ["needs_review", "final_needs_review"]:
|
||||
if col_name in col_index:
|
||||
c = col_index[col_name]
|
||||
|
|
@ -284,7 +307,7 @@ for col_name in ["needs_review", "final_needs_review"]:
|
|||
wb.save(out_path)
|
||||
print(f"\n***Step 6: Excel salvato in {out_path}")
|
||||
|
||||
# --- Conteggio final_needs_review ---
|
||||
# --- Count of final_needs_review values ---
|
||||
review_counts = df_out["final_needs_review"].value_counts(dropna=False)
|
||||
true_count = review_counts.get(True, 0)
|
||||
false_count = review_counts.get(False, 0)
|
||||
|
|
|
|||
|
|
@ -0,0 +1,127 @@
|
|||
# --- Import librerie ---
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import unicodedata
|
||||
import re
|
||||
import warnings
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import pickle
|
||||
|
||||
# ----- File paths -----
|
||||
# Existing labelled dataset that the reviewed rows are merged into.
LABELED_IN = "main/datasets/annotated_dataset.xlsx"
|
||||
# Reviewed labelling output; its final_* columns supply the new rows.
REVIEWED = "main/datasets/first500_reviewed.xlsx"
|
||||
# Destination workbook for the merged dataset.
LABELED_OUT = "main/datasets/annotated_dataset_updated.xlsx"
|
||||
|
||||
# ----- Funzioni di pulizia -----
|
||||
def clean(x):
    """Return *x* as a whitespace-trimmed string; missing values become ""."""
    return "" if pd.isna(x) else str(x).strip()
|
||||
|
||||
def normalize_problem_type(x):
    """Expand the shorthand problem-type codes to their full names.

    The value is passed through ``clean`` and upper-cased; ``"S"`` becomes
    ``"RULE_SPECIFIC"``, ``"G"`` becomes ``"GENERIC"`` and any other value is
    returned in its upper-cased form.
    """
    code = clean(x).upper()
    expanded = {"S": "RULE_SPECIFIC", "G": "GENERIC"}
    return expanded.get(code, code)
|
||||
|
||||
def normalize_severity(x):
    """Return the severity value passed through ``clean`` and upper-cased."""
    cleaned = clean(x)
    return cleaned.upper()
|
||||
|
||||
def clean_id(x):
    """Normalize an automation id so that ids from different sheets compare equal.

    Args:
        x: Raw cell value (string, number or a missing value).

    Returns:
        ``""`` when ``pd.isna(x)`` is true; otherwise ``str(x)`` with the
        surrounding whitespace and surrounding single/double quotes removed,
        lower-cased.
    """
    if pd.isna(x):
        return ""
    # Trim whitespace, then quotes in any nesting order, then the whitespace
    # that was enclosed by the quotes (e.g. '" abc "').
    s = str(x).strip().strip("\"'").strip()
    return s.lower()
|
||||
|
||||
def clean_folder(x):
    """Normalize a folder name for use as a comparison key.

    Args:
        x: Raw cell value (string or a missing value).

    Returns:
        ``""`` when ``pd.isna(x)`` is true; otherwise the NFKC-normalized,
        lower-cased text with every whitespace run collapsed to one space and
        no leading/trailing whitespace.
    """
    if pd.isna(x):
        return ""
    # Normalize first: NFKC can itself yield capitals and spaces, so the
    # lower-casing and trimming have to happen afterwards.
    s = unicodedata.normalize("NFKC", str(x)).lower()
    return re.sub(r"\s+", " ", s).strip()
|
||||
|
||||
# ----- Step 1: load datasets -----
df_labeled = pd.read_excel(LABELED_IN)
# Discard columns whose name starts with "Unnamed", then fully empty rows.
df_labeled = df_labeled.loc[:, ~df_labeled.columns.str.contains("^Unnamed")].copy()
df_labeled = df_labeled.dropna(how="all")
df_labeled = df_labeled.rename(columns={"automation_id": "id"})

df_rev = pd.read_excel(REVIEWED)

# Normalize error_type in the existing labelled data (only this column is
# normalized here; severity of the existing rows is left untouched).
if "error_type" in df_labeled.columns:
    df_labeled["error_type"] = df_labeled["error_type"].apply(normalize_problem_type)

# Build the clean rows from the reviewed file, using its final_* columns.
rows = []
for _, r in df_rev.iterrows():
    category = clean(r["final_category"])
    subcategory = clean(r["final_subcategory"])
    error_type = normalize_problem_type(r["final_problem_type"])
    severity = normalize_severity(r["final_gravity"])

    # HARMLESS rows carry no subcategory, error type or severity.
    if category.upper() == "HARMLESS":
        subcategory = ""
        error_type = "none"
        severity = "none"

    rows.append({
        "id": clean(r["id"]),
        "folder": clean(r["folder"]),
        "automation": clean(r["automation_text"]),
        "description": clean(r.get("llm_rationale", "")),
        "category": category,
        "subcategory": subcategory,
        "error_type": error_type,
        "severity": severity,
        "borderline": clean(r["borderline"]),
    })

df_new = pd.DataFrame(rows)

# The normalizers upper-case everything; keep the "none" marker lower-case.
df_new["error_type"] = df_new["error_type"].apply(lambda x: x.lower() if x.lower() == "none" else x)
df_new["severity"] = df_new["severity"].apply(lambda x: x.lower() if x.lower() == "none" else x)

# Drop reviewed rows that have no category.
df_new = df_new[df_new["category"] != ""].copy()

# Normalize the (id, folder) key columns in both datasets.
for df in [df_labeled, df_new]:
    df["id"] = df["id"].apply(clean_id)
    df["folder"] = df["folder"].apply(clean_folder)

# De-duplicate: a reviewed row replaces any labelled row with the same key.
new_keys = set(zip(df_new["id"], df_new["folder"]))
df_labeled_clean = df_labeled[~df_labeled.apply(lambda r: (r["id"], r["folder"]) in new_keys, axis=1)].copy()

# Final concatenation; missing cells become empty strings.
df_final = pd.concat([df_labeled_clean, df_new], ignore_index=True).fillna("")

# Save the updated dataset.
df_final.to_excel(LABELED_OUT, index=False)
print("✅ Merge completato")
print("Righe iniziali:", len(df_labeled))
# Report the rows actually appended: reviewed rows without a category were
# dropped above, so len(df_rev) would over-count.
print("Righe aggiunte:", len(df_new))
print("Totale finale:", len(df_final))

# ----- Step 2: compute embeddings -----
warnings.filterwarnings("ignore")
model = SentenceTransformer("all-MiniLM-L6-v2")

texts = df_final["automation"].tolist()
embeddings = model.encode(
    texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True
).astype("float32")

print("Shape embeddings ricalcolati:", embeddings.shape)

# ----- Step 3: save embeddings -----
# NOTE(review): rows are de-duplicated on (id, folder) but only "id" is stored
# next to the vectors — confirm the consumer does not need "folder" as well.
with open("main/labeled_embeddings.pkl", "wb") as f:
    pickle.dump({"embeddings": embeddings, "id": df_final["id"].tolist()}, f)

print("Embeddings salvati con successo!")
|
||||
Loading…
Reference in New Issue