# --- Import libraries ---
import pandas as pd
import numpy as np
import unicodedata
import re
import warnings
from sentence_transformers import SentenceTransformer
import pickle


# ----- File paths -----
LABELED_IN = "main/datasets/annotated_dataset.xlsx"
REVIEWED = "main/datasets/first500_reviewed.xlsx"
LABELED_OUT = "main/datasets/annotated_dataset_updated.xlsx"
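# Note: the paths above are relative, so the script must be run from the
# directory that contains main/.
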
# ----- Cleaning functions -----
def clean(x):
    if pd.isna(x):
        return ""
    return str(x).strip()


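# Map single-letter annotator codes to full labels, e.g. "s" -> "RULE_SPECIFIC",
# "g" -> "GENERIC"; any other value is returned cleaned and upper-cased.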
def normalize_problem_type(x):
    x = clean(x).upper()
    if x == "S":
        return "RULE_SPECIFIC"
    if x == "G":
        return "GENERIC"
    return x


def normalize_severity(x):
    return clean(x).upper()


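# Normalize identifiers: strip surrounding quotes and whitespace, then lower-case.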
def clean_id(x):
    if pd.isna(x):
        return ""
    s = str(x).strip().strip('"').strip("'")
    return s.lower()


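# Normalize folder names: lower-case, apply Unicode NFKC normalization, and
# collapse runs of whitespace so equivalent folder names compare equal,
# e.g. "  Living   Room " -> "living room".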
def clean_folder(x):
    if pd.isna(x):
        return ""
    s = str(x).strip().lower()
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r'\s+', ' ', s)
    return s


# ----- Step 1: load the datasets -----
df_labeled = pd.read_excel(LABELED_IN)
df_labeled = df_labeled.loc[:, ~df_labeled.columns.str.contains("^Unnamed")].copy()
df_labeled = df_labeled.dropna(how="all")
df_labeled = df_labeled.rename(columns={"automation_id": "id"})

df_rev = pd.read_excel(REVIEWED)

# Normalize error_type values in the existing labeled dataset
if "error_type" in df_labeled.columns:
    df_labeled["error_type"] = df_labeled["error_type"].apply(normalize_problem_type)

# Build the cleaned dataset from the first 500 reviewed rows
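# The reviewed file carries corrected "final_*" columns, which take precedence
# over the original annotations for these rows.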
rows = []
for _, r in df_rev.iterrows():
    category = clean(r["final_category"])
    subcategory = clean(r["final_subcategory"])
    error_type = normalize_problem_type(r["final_problem_type"])
    severity = normalize_severity(r["final_gravity"])

    # HARMLESS consistency: harmless rows carry no subcategory, error type, or severity
    if category.upper() == "HARMLESS":
        subcategory = ""
        error_type = "none"
        severity = "none"

    rows.append({
        "id": clean(r["id"]),
        "folder": clean(r["folder"]),
        "automation": clean(r["automation_text"]),
        "description": clean(r.get("llm_rationale", "")),
        "category": category,
        "subcategory": subcategory,
        "error_type": error_type,
        "severity": severity,
        "borderline": clean(r["borderline"]),
    })

df_new = pd.DataFrame(rows)

# Normalize 'none' values to lower case
df_new["error_type"] = df_new["error_type"].apply(lambda x: x.lower() if x.lower() == "none" else x)
df_new["severity"] = df_new["severity"].apply(lambda x: x.lower() if x.lower() == "none" else x)

# Drop rows without a category
df_new = df_new[df_new["category"] != ""].copy()

# Clean id and folder in both datasets
for df in [df_labeled, df_new]:
    df["id"] = df["id"].apply(clean_id)
    df["folder"] = df["folder"].apply(clean_folder)

# Remove duplicates: drop from the labeled dataset the rows already present in df_new
new_keys = set(zip(df_new["id"], df_new["folder"]))
df_labeled_clean = df_labeled[~df_labeled.apply(lambda r: (r["id"], r["folder"]) in new_keys, axis=1)].copy()

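# At this point (id, folder) is the unique key: reviewed rows replace their
# original counterparts in the final merge.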
# Final concat
df_final = pd.concat([df_labeled_clean, df_new], ignore_index=True).fillna("")

# Save the updated dataset
df_final.to_excel(LABELED_OUT, index=False)
print("✅ Merge completed")
print("Initial rows:", len(df_labeled))
print("Rows added:", len(df_new))
print("Final total:", len(df_final))


# ----- Step 2: compute embeddings -----
warnings.filterwarnings("ignore")
model = SentenceTransformer("all-MiniLM-L6-v2")

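# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings;
# normalize_embeddings=True below makes them unit length, so dot products
# equal cosine similarities.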
texts = df_final["automation"].tolist()
embeddings = model.encode(
    texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True
).astype("float32")

print("Recomputed embeddings shape:", embeddings.shape)


# ----- Step 3: save embeddings -----
with open("main/labeled_embeddings.pkl", "wb") as f:
    pickle.dump({"embeddings": embeddings, "id": df_final["id"].tolist()}, f)

print("Embeddings saved successfully!")