# dataset/merge.py

# --- Library imports ---
import pandas as pd
import numpy as np
import unicodedata
import re
import warnings
from sentence_transformers import SentenceTransformer
import pickle
# ----- File paths -----
LABELED_IN = "main/datasets/annotated_dataset.xlsx"  # existing annotated dataset (input)
REVIEWED = "main/datasets/first500_reviewed.xlsx"  # manually reviewed first-500 rows (input)
LABELED_OUT = "main/datasets/annotated_dataset_updated.xlsx"  # merged dataset (output)
# ----- Cleaning helpers -----
def clean(x):
    """Return *x* as a whitespace-stripped string; NaN/None become ""."""
    return "" if pd.isna(x) else str(x).strip()
def normalize_problem_type(x):
    """Expand the shorthand codes 'S'/'G' to their full labels (case-insensitive)."""
    value = clean(x).upper()
    # Any value other than the two known codes passes through unchanged.
    return {"S": "RULE_SPECIFIC", "G": "GENERIC"}.get(value, value)
def normalize_severity(x):
    """Return the severity value stripped and upper-cased ("" for NaN)."""
    return clean(x).upper()
def clean_id(x):
    """Normalize an identifier: drop surrounding whitespace/quotes, lowercase it."""
    if pd.isna(x):
        return ""
    # Strip order matters: whitespace first, then double quotes, then single quotes.
    return str(x).strip().strip('"').strip("'").lower()
def clean_folder(x):
    """Canonicalize a folder name: lowercase, NFKC-normalize, collapse whitespace."""
    if pd.isna(x):
        return ""
    folded = unicodedata.normalize("NFKC", str(x).strip().lower())
    # Collapse any run of whitespace (incl. NFKC-produced spaces) to one space.
    return re.sub(r'\s+', ' ', folded)
# ----- Step 1: load the datasets -----
df_labeled = (
    pd.read_excel(LABELED_IN)
    .pipe(lambda d: d.loc[:, ~d.columns.str.contains("^Unnamed")])  # drop index-export columns
    .dropna(how="all")
    .rename(columns={"automation_id": "id"})
    .copy()
)
df_rev = pd.read_excel(REVIEWED)
# Normalize the problem_type codes already present in the labeled data.
if "error_type" in df_labeled.columns:
    df_labeled["error_type"] = df_labeled["error_type"].map(normalize_problem_type)
# Build the cleaned dataset from the first 500 reviewed rows
def _record_from_review(rec):
    """Convert one reviewed row (a Series) into a normalized record dict."""
    category = clean(rec["final_category"])
    subcategory = clean(rec["final_subcategory"])
    error_type = normalize_problem_type(rec["final_problem_type"])
    severity = normalize_severity(rec["final_gravity"])
    # HARMLESS consistency: such rows carry no finer-grained labels.
    if category.upper() == "HARMLESS":
        subcategory, error_type, severity = "", "none", "none"
    return {
        "id": clean(rec["id"]),
        "folder": clean(rec["folder"]),
        "automation": clean(rec["automation_text"]),
        "description": clean(rec.get("llm_rationale", "")),
        "category": category,
        "subcategory": subcategory,
        "error_type": error_type,
        "severity": severity,
        "borderline": clean(rec["borderline"]),
    }

df_new = pd.DataFrame([_record_from_review(r) for _, r in df_rev.iterrows()])
# Normalize 'none' markers to lowercase (other values pass through unchanged).
for _col in ("error_type", "severity"):
    df_new[_col] = df_new[_col].apply(lambda v: "none" if v.lower() == "none" else v)
# Drop rows that ended up with no category at all.
df_new = df_new[df_new["category"] != ""].copy()
# Clean the (id, folder) join keys in both datasets the same way.
for df in (df_labeled, df_new):
    df["id"] = df["id"].apply(clean_id)
    df["folder"] = df["folder"].apply(clean_folder)
# Deduplicate: drop from the labeled set any (id, folder) pair superseded by df_new.
# (zip-based membership test instead of a per-row DataFrame.apply — same result, less overhead)
new_keys = set(zip(df_new["id"], df_new["folder"]))
keep_mask = [key not in new_keys for key in zip(df_labeled["id"], df_labeled["folder"])]
df_labeled_clean = df_labeled[keep_mask].copy()
# Final concat and save
df_final = pd.concat([df_labeled_clean, df_new], ignore_index=True).fillna("")
df_final.to_excel(LABELED_OUT, index=False)
print("✅ Merge completato")
print("Righe iniziali:", len(df_labeled))
# BUG FIX: report the rows actually added (after dropping empty-category rows),
# not the raw row count of the reviewed file.
print("Righe aggiunte:", len(df_new))
print("Totale finale:", len(df_final))
# ----- Step 2: compute embeddings -----
warnings.filterwarnings("ignore")
model = SentenceTransformer("all-MiniLM-L6-v2")
texts = df_final["automation"].tolist()
# One L2-normalized float32 vector per automation text.
embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
).astype("float32")
print("Shape embeddings ricalcolati:", embeddings.shape)
# ----- Step 3: persist embeddings alongside their row ids -----
with open("main/labeled_embeddings.pkl", "wb") as f:
    pickle.dump({"embeddings": embeddings, "id": df_final["id"].tolist()}, f)
print("Embeddings salvati con successo!")