# Source listing metadata: 83 lines, 3.2 KiB, Python
# --- Import librerie ---
|
|
import pandas as pd
|
|
from openai import AzureOpenAI
|
|
from sentence_transformers import SentenceTransformer
|
|
import numpy as np
|
|
import re
|
|
from openpyxl.styles import PatternFill
|
|
from openpyxl import load_workbook
|
|
from collections import Counter
|
|
from prompts.prompt import build_prompt_local
|
|
import warnings
|
|
import logging
|
|
from sentence_transformers import SentenceTransformer
|
|
import numpy as np
|
|
import pickle
|
|
import unicodedata
|
|
|
|
# ----- Load datasets -----
# Each workbook is read exactly once. Rows in which every cell is empty are
# dropped right away so they never reach the cleanup or embedding steps.
df_labeled = pd.read_excel("main/datasets/annotated_dataset.xlsx").dropna(how="all")
df_unlabeled = pd.read_excel("main/datasets/unlabeled_dataset.xlsx").dropna(how="all")

print("***STEP 1***")
print("Dataset etichettato caricato. Numero righe:", len(df_labeled))
print("Dataset non etichettato caricato. Numero righe:", len(df_unlabeled))

# ----- Column cleanup -----
# The frames loaded above are reused from here on; reading the workbooks a
# second time would only repeat the same (slow) Excel I/O for identical data.
|
|
|
|
def clean_str(x):
    """Normalize a cell value into a canonical lowercase ASCII key.

    The result contains only the characters ``a-z``, ``0-9`` and single
    spaces, with no leading or trailing space, so that values which differ
    only in case, punctuation or spacing compare equal.

    Args:
        x: Any scalar cell value (string, number, ``None``/NaN, ...).

    Returns:
        The normalized string, or ``""`` when ``x`` is missing
        (``pd.isna(x)`` is true).
    """
    if pd.isna(x):
        return ""

    # Normalize BEFORE lowercasing: some compatibility characters only turn
    # into plain ASCII capitals after NFKC, and would otherwise be discarded
    # by the a-z filter below.
    s = unicodedata.normalize("NFKC", str(x)).lower()

    # Convert every whitespace run (tabs, newlines, ...) into one space
    # first, so the character filter does not glue adjacent words together.
    s = re.sub(r"\s+", " ", s)

    # Drop everything that is not a lowercase ASCII letter, a digit or a space.
    s = re.sub(r"[^a-z0-9 ]+", "", s)

    # Removing punctuation can leave doubled or dangling spaces behind.
    return re.sub(r" +", " ", s).strip()
|
|
|
|
# Apply the cleanup to automation_id and folder in both frames so that their
# (automation_id, folder) keys are directly comparable.
for df in [df_labeled, df_unlabeled]:
    for column in ("automation_id", "folder"):
        df[column] = df[column].apply(clean_str)

# Key sets for both datasets. The list keeps one key per unlabeled row, in row
# order, so it can be turned into a boolean mask below.
labeled_pairs = set(zip(df_labeled["automation_id"], df_labeled["folder"]))
unlabeled_row_keys = list(zip(df_unlabeled["automation_id"], df_unlabeled["folder"]))
unlabeled_pairs = set(unlabeled_row_keys)

# Filter: drop from the unlabeled dataset the rows already present in
# df_labeled. A plain membership test over the zipped keys avoids the slow
# row-wise DataFrame.apply and always produces a boolean Series, even when
# the frame has no rows.
mask = pd.Series(
    [key not in labeled_pairs for key in unlabeled_row_keys],
    index=df_unlabeled.index,
    dtype=bool,
)
df_unlabeled_filtered = df_unlabeled[mask]
print("Numero righe df_unlabeled dopo aver rimosso quelle etichettate:", len(df_unlabeled_filtered))

# Find labeled pairs that do not appear in the unlabeled dataset (debug).
missing_pairs = labeled_pairs - unlabeled_pairs
print("Numero righe etichettate non trovate nel dataset non etichettato:", len(missing_pairs))
if missing_pairs:
    print("Coppie mancanti:")
    # Sorted so the debug listing is identical from run to run (set iteration
    # order is not stable across interpreter sessions).
    for p in sorted(missing_pairs):
        print(p)
|
|
|
|
|
|
# ----- Step 2: embeddings -----
# Silence generic warnings.
warnings.filterwarnings("ignore")

# Silence logging from sentence-transformers / transformers / HF hub.
for noisy_logger in ("sentence_transformers", "transformers", "huggingface_hub"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

model = SentenceTransformer("all-MiniLM-L6-v2")

# dropna(how="all") only removes fully empty rows, so individual "automation"
# cells may still be NaN, and Excel may deliver numeric cells as int/float.
# Coerce every cell to str (missing -> "") so the encoder never receives a
# non-string and the output stays aligned row-by-row with df_labeled.
# NOTE(review): confirm an empty string is the desired input for missing text.
texts = ["" if pd.isna(value) else str(value) for value in df_labeled["automation"]]

embeddings = model.encode(
    texts,
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=True,
)
embeddings = embeddings.astype("float32")

print("Shape embeddings ricalcolati:", embeddings.shape)
|
|
|
|
# ----- Step 3: save embeddings -----
# Store the embedding matrix together with the automation_id values of
# df_labeled in a single pickle file.
with open("main/labeled_embeddings2.pkl", "wb") as out_file:
    payload = {
        "embeddings": embeddings,
        "automation_id": df_labeled["automation_id"].tolist(),
    }
    pickle.dump(payload, out_file)

print("Embeddings salvati con successo!")