# --- Import libraries ---
import pandas as pd
import numpy as np
import unicodedata
import re
import warnings
from sentence_transformers import SentenceTransformer
import pickle


# ----- File paths -----
LABELED_IN = "main/datasets/annotated_dataset.xlsx"
REVIEWED = "main/datasets/first500_reviewed.xlsx"
LABELED_OUT = "main/datasets/annotated_dataset_updated.xlsx"
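# Note: the paths above are relative, so the script must be run from the
# directory that contains main/.
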
# ----- Cleaning functions -----
def clean(x):
    if pd.isna(x):
        return ""
    return str(x).strip()


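# Map single-letter annotator codes to full labels, e.g. "s" -> "RULE_SPECIFIC",
# "g" -> "GENERIC"; any other value is returned cleaned and upper-cased.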
def normalize_problem_type(x):
    x = clean(x).upper()
    if x == "S":
        return "RULE_SPECIFIC"
    if x == "G":
        return "GENERIC"
    return x


def normalize_severity(x):
    return clean(x).upper()


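# Normalize identifiers: strip surrounding quotes and whitespace, then lower-case.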
def clean_id(x):
    if pd.isna(x):
        return ""
    s = str(x).strip().strip('"').strip("'")
    return s.lower()


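# Normalize folder names: lower-case, apply Unicode NFKC normalization, and
# collapse runs of whitespace so equivalent folder names compare equal,
# e.g. "  Living   Room " -> "living room".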
def clean_folder(x):
    if pd.isna(x):
        return ""
    s = str(x).strip().lower()
    s = unicodedata.normalize("NFKC", s)
    s = re.sub(r'\s+', ' ', s)
    return s


# ----- Step 1: load the datasets -----
df_labeled = pd.read_excel(LABELED_IN)
df_labeled = df_labeled.loc[:, ~df_labeled.columns.str.contains("^Unnamed")].copy()
df_labeled = df_labeled.dropna(how="all")
df_labeled = df_labeled.rename(columns={"automation_id": "id"})

df_rev = pd.read_excel(REVIEWED)

# Normalize error_type values in the existing labeled dataset
if "error_type" in df_labeled.columns:
    df_labeled["error_type"] = df_labeled["error_type"].apply(normalize_problem_type)

# Build the cleaned dataset from the first 500 reviewed rows
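# The reviewed file carries corrected "final_*" columns, which take precedence
# over the original annotations for these rows.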
rows = []
for _, r in df_rev.iterrows():
    category = clean(r["final_category"])
    subcategory = clean(r["final_subcategory"])
    error_type = normalize_problem_type(r["final_problem_type"])
    severity = normalize_severity(r["final_gravity"])

    # HARMLESS consistency: harmless rows carry no subcategory, error type, or severity
    if category.upper() == "HARMLESS":
        subcategory = ""
        error_type = "none"
        severity = "none"

    rows.append({
        "id": clean(r["id"]),
        "folder": clean(r["folder"]),
        "automation": clean(r["automation_text"]),
        "description": clean(r.get("llm_rationale", "")),
        "category": category,
        "subcategory": subcategory,
        "error_type": error_type,
        "severity": severity,
        "borderline": clean(r["borderline"]),
    })

df_new = pd.DataFrame(rows)

# Normalize 'none' values to lower case
df_new["error_type"] = df_new["error_type"].apply(lambda x: x.lower() if x.lower() == "none" else x)
df_new["severity"] = df_new["severity"].apply(lambda x: x.lower() if x.lower() == "none" else x)

# Drop rows without a category
df_new = df_new[df_new["category"] != ""].copy()

# Clean id and folder in both datasets
for df in [df_labeled, df_new]:
    df["id"] = df["id"].apply(clean_id)
    df["folder"] = df["folder"].apply(clean_folder)

# Remove duplicates: drop from the labeled dataset the rows already present in df_new
new_keys = set(zip(df_new["id"], df_new["folder"]))
df_labeled_clean = df_labeled[~df_labeled.apply(lambda r: (r["id"], r["folder"]) in new_keys, axis=1)].copy()

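# At this point (id, folder) is the unique key: reviewed rows replace their
# original counterparts in the final merge.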
# Final concat
df_final = pd.concat([df_labeled_clean, df_new], ignore_index=True).fillna("")

# Save the updated dataset
df_final.to_excel(LABELED_OUT, index=False)
print("✅ Merge completed")
print("Initial rows:", len(df_labeled))
print("Rows added:", len(df_new))
print("Final total:", len(df_final))


# ----- Step 2: compute embeddings -----
warnings.filterwarnings("ignore")
model = SentenceTransformer("all-MiniLM-L6-v2")

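# Note: all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings;
# normalize_embeddings=True below makes them unit length, so dot products
# equal cosine similarities.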
texts = df_final["automation"].tolist()
embeddings = model.encode(
    texts, show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True
).astype("float32")

print("Recomputed embeddings shape:", embeddings.shape)


# ----- Step 3: save embeddings -----
with open("main/labeled_embeddings.pkl", "wb") as f:
    pickle.dump({"embeddings": embeddings, "id": df_final["id"].tolist()}, f)

print("Embeddings saved successfully!")