Upload files to "/"
This commit is contained in:
commit
3edc9bb204
|
|
@ -0,0 +1,363 @@
|
|||
# --- Import librerie ---
|
||||
import pandas as pd
|
||||
from openai import AzureOpenAI
|
||||
import pickle
|
||||
from sentence_transformers import SentenceTransformer
|
||||
import numpy as np
|
||||
import faiss
|
||||
import openpyxl
|
||||
import re
|
||||
import json
|
||||
from openpyxl.styles import PatternFill
|
||||
from openpyxl import load_workbook
|
||||
from collections import Counter
|
||||
from prompts.prompt import build_prompt_local
|
||||
import warnings
|
||||
import logging
|
||||
|
||||
|
||||
# --- Configuration ---
import os

endpoint = "https://gpt-sw-central-tap-security.openai.azure.com/"
deployment = "gpt-4o"
# SECURITY FIX: the subscription key was hard-coded in source control.
# Read it from the environment instead; the previously committed key must be
# considered leaked and rotated/revoked in the Azure portal.
subscription_key = os.environ["AZURE_OPENAI_API_KEY"]

# Azure OpenAI client used for all chat-completion calls below.
client = AzureOpenAI(
    azure_endpoint=endpoint,
    api_key=subscription_key,
    api_version="2024-05-01-preview",
)
||||
# ----- Step 1: load the datasets -----
# Labeled columns: automation, description, category, subcategory, problem_type, gravity
df_labeled = pd.read_csv("main/datasets/annotated_dataset.csv", encoding="cp1252", sep=';')
df_unlabeled = pd.read_csv("main/datasets/unlabeled_dataset.csv", sep='\t', encoding='utf-8')
print("***STEP 1***\nDataset etichettato caricato. Numero righe:", len(df_labeled), "\nDataset non etichettato caricato. Numero righe:", len(df_unlabeled))
def clean_id(x):
    """Normalize an automation id.

    Returns the first run of digits found in the value's string form;
    if no digits exist, returns the stripped text. Missing values (NaN/None)
    become the empty string.
    """
    if pd.isna(x):
        return ""
    text = str(x)
    digits = re.search(r"\d+", text)  # first sequence of digits
    if digits is not None:
        return digits.group(0)
    return text.strip()
# Normalize the join keys on both datasets so (automation_id, folder)
# pairs are directly comparable.
df_labeled["automation_id"] = df_labeled["automation_id"].apply(clean_id)
df_unlabeled["automation_id"] = df_unlabeled["automation_id"].apply(clean_id)
df_labeled["folder"] = df_labeled["folder"].astype(str).str.strip()
df_unlabeled["folder"] = df_unlabeled["folder"].astype(str).str.strip()

# Set of pairs that already have a label.
labeled_pairs = set(zip(df_labeled["automation_id"], df_labeled["folder"]))

# Keep only automations not labeled yet. A zip-based membership mask gives the
# same result as the previous row-wise DataFrame.apply(axis=1) but avoids the
# per-row Series construction, which is far slower on large frames.
unlabeled_mask = [
    pair not in labeled_pairs
    for pair in zip(df_unlabeled["automation_id"], df_unlabeled["folder"])
]
df_unlabeled_filtered = df_unlabeled[unlabeled_mask]
print("Automazioni non etichettate rimanenti dopo la pulizia:", len(df_unlabeled_filtered))
# --- Step 2: embeddings ---
# Silence generic warnings and the chatty HF / transformers loggers.
warnings.filterwarnings("ignore")
for noisy_logger in ("sentence_transformers", "transformers", "huggingface_hub"):
    logging.getLogger(noisy_logger).setLevel(logging.ERROR)

print("\n***Step 2 ***\nEmbeddings")
model = SentenceTransformer("all-MiniLM-L6-v2")

# Load the pre-computed embeddings of the labeled dataset.
with open("main/labeled_embeddings.pkl", "rb") as f:
    data = pickle.load(f)

embeddings = data['embeddings'].astype("float32")
print("Shape embeddings:", embeddings.shape)
# ----- Step3: build the FAISS index ---
# Flat L2 (Euclidean distance) index over the labeled-set embeddings.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
print(f"\n***Step 3: Indice FAISS creato***. \nNumero di vettori nell'indice: {index.ntotal}")
# ----- Step4: retrieval (similarity) ---
# Trial run on the first 500 unlabeled automations.
k = 5  # neighbours retrieved per query
df_sample = df_unlabeled_filtered.head(500)
output_rows = []  # one row per (query, retrieved neighbour)
llm_rows = []     # one row per query, with the LLM verdict
def sim_label(distance: float) -> str:
    """Map an L2 distance to a human-readable similarity band (Italian labels)."""
    bands = (
        (0.50, "Match forte"),
        (0.75, "Match plausibile"),
        (0.90, "Similarità instabile"),
    )
    for threshold, label in bands:
        if distance <= threshold:
            return label
    return "Troppo distante"
for i, row in df_sample.iterrows():
    query_text = str(row["human_like"])
    print("numero corrente:", i)

    # Embed the new automation and fetch its k nearest labeled neighbours.
    query_emb = model.encode([query_text], convert_to_numpy=True).astype("float32")
    distances, indices = index.search(query_emb, k)

    # Global top-1 metrics (computed once per automation).
    top1_distance = float(distances[0][0])
    top1_confidence = 1 / (1 + top1_distance)
    top1_similarity_label = sim_label(top1_distance)

    # BUGFIX: the original code ran two identical `for rank in range(k)` loops;
    # the first one only collected topk_cats while also computing unused
    # confidence/label values. A single pass does both jobs.
    topk_cats = []
    for rank in range(k):
        idx = int(indices[0][rank])
        distance = float(distances[0][rank])
        confidence = 1 / (1 + distance)
        label = sim_label(distance)
        retrieved_row = df_labeled.iloc[idx]
        topk_cats.append(str(retrieved_row["category"]))

        output_rows.append({
            # query
            "automazione da etichettare": query_text,

            # retrieval info for this neighbour
            "rank": rank + 1,
            "retrieved_idx": idx,
            "automazione simile": retrieved_row["automation"],
            "categoria automazione simile": retrieved_row["category"],
            "distanza": distance,
            "confidence": round(confidence, 4),
            "similarity": label,
        })

    # Aggregated top-k metrics.
    rank1_category = topk_cats[0] if topk_cats else ""
    majority_category = Counter(topk_cats).most_common(1)[0][0] if topk_cats else ""
    consistency = (sum(c == majority_category for c in topk_cats) / len(topk_cats)) if topk_cats else 0.0

    # Attach the aggregated metrics to every per-neighbour row of this query
    # (same values the original wrote, just filled in after computing them).
    for out in output_rows[-k:]:
        out.update({
            "rank1_distance": top1_distance,
            "rank1_confidence": round(top1_confidence, 4),
            "rank1_similarity_label": top1_similarity_label,
            "rank1_category": rank1_category,
            "majority_category": majority_category,
            "consistency": round(consistency, 3),
            "top5_categories": " | ".join(topk_cats)
        })

    # --- Step5: send the data to the LLM ---
    # (1) Build the prompt from the retrieved neighbours.
    retrieved = df_labeled.iloc[indices[0]].copy()
    retrieved["distance"] = distances[0].astype(float)
    retrieved["confidence"] = retrieved["distance"].apply(lambda d: 1 / (1 + float(d)))
    retrieved["similarity"] = retrieved["distance"].apply(sim_label)
    prompt = build_prompt_local(query_text, retrieved, sim_label)

    # (2) Model call: the system message forces JSON-only output.
    resp = client.chat.completions.create(
        model=deployment,
        messages=[
            {"role": "system", "content": "Return ONLY valid JSON. No extra text."},
            {"role": "user", "content": prompt},
        ],
        temperature=0
    )
    content = resp.choices[0].message.content.strip()

    # (3) Parse the answer; fall back to a "needs review" stub on bad JSON.
    try:
        parsed = json.loads(content)
    except Exception:
        parsed = {
            "automation": query_text,
            "category": "",
            "subcategory": "",
            "problem_type": "",
            "gravity": "",
            "scores": {},
            "needs_human_review": True,
            "short_rationale": f"JSON_PARSE_ERROR: {content[:200]}"
        }

    # (4) One output row per automation:
    # - retrieval metrics (rank1/majority/consistency)
    # - LLM output (scores + final label + review flag)
    llm_category = parsed.get("category", "")
    llm_subcategory = parsed.get("subcategory", "")
    llm_problem_type = parsed.get("problem_type", "")
    llm_gravity = parsed.get("gravity", "")
    final_category = llm_category
    final_subcategory = llm_subcategory
    final_problem_type = llm_problem_type
    final_gravity = llm_gravity
    # BUGFIX: the HARMLESS normalization used to overwrite the llm_* fields
    # (the raw model output) AFTER they had been copied into final_*, so the
    # "final" columns kept the unnormalized values. Normalize final_* instead
    # and keep llm_* as the untouched model answer.
    if llm_category.strip().upper() == "HARMLESS":
        final_subcategory = ""
        final_problem_type = "NONE"
        final_gravity = "NONE"

    # ================= HUMAN REVIEW LOGIC =================
    needs_human_review = bool(parsed.get("needs_human_review", True))
    OVERRIDE_MAX_DISTANCE = 0.90
    OVERRIDE_MIN_CONSISTENCY = 0.60
    # Strong alignment: LLM = majority = top1
    aligned_strong = (
        llm_category == majority_category and
        llm_category == rank1_category and
        llm_category != ""
    )
    # Distance not excessive and good consistency.
    good_retrieval = (
        top1_distance <= OVERRIDE_MAX_DISTANCE and
        consistency >= OVERRIDE_MIN_CONSISTENCY
    )
    # Then NO review is required even if the model set the flag to True.
    if aligned_strong and good_retrieval:
        needs_human_review = False
    # =====================================================

    llm_rows.append({
        "automation_id": row.get("automation_id", ""),
        "folder": row.get("folder", ""),
        "automation_text": query_text,

        "rank1_distance": top1_distance,
        "rank1_confidence": round(top1_confidence, 4),
        "rank1_similarity_label": top1_similarity_label,
        "rank1_category": rank1_category,
        "majority_category": majority_category,
        "consistency": round(consistency, 3),
        "top5_categories": " | ".join(topk_cats),

        "llm_category": llm_category,
        "llm_subcategory": llm_subcategory,
        "llm_problem_type": llm_problem_type,
        "llm_gravity": llm_gravity,
        "llm_needs_human_review": parsed.get("needs_human_review", True),
        "final_needs_human_review": needs_human_review,

        "final_category": final_category,
        "final_subcategory": final_subcategory,
        "final_problem_type": final_problem_type,
        "final_gravity": final_gravity,

        "llm_rationale": parsed.get("short_rationale", "")
    })
# --- Step6: integration and output ---
# (5) Export the per-automation LLM results as an Excel file.
df_llm = pd.DataFrame(llm_rows)
out_path = "main/datasets/labeling_first500.xlsx"
df_llm.to_excel(out_path, index=False)

wb = load_workbook(out_path)
ws = wb.active

# Colour coding for the review flags: red = needs review, green = ok.
true_fill = PatternFill(start_color="FF6347", end_color="FF6347", fill_type="solid")  # red
false_fill = PatternFill(start_color="90EE90", end_color="90EE90", fill_type="solid")  # green
col_index = {cell.value: idx for idx, cell in enumerate(ws[1], start=1)}

# Same colouring rule applies to both review columns; the original code
# duplicated this loop verbatim for each column.
for review_col in ("llm_needs_human_review", "final_needs_human_review"):
    if review_col not in col_index:
        continue
    c = col_index[review_col]
    for r in range(2, ws.max_row + 1):
        cell = ws.cell(row=r, column=c)
        if cell.value is True:
            cell.fill = true_fill
        elif cell.value is False:
            cell.fill = false_fill

wb.save(out_path)
print(f"\n***Step 6: Retrieval e LLM ***\nExcel LLM salvato in {out_path}")
# --- needs_human_review tally ---
review_counts = df_llm["final_needs_human_review"].value_counts(dropna=False)
print("\n--- Needs human review summary ---")
print(f"needs_human_review = True : {review_counts.get(True, 0)}")
print(f"needs_human_review = False: {review_counts.get(False, 0)}")
# --- Step7: final dataset over all automations (text + labels only) ---
_FINAL_COLUMNS = [
    "automation_text",
    "llm_category",
    "llm_subcategory",
    "llm_gravity",
    "llm_problem_type",
    "final_needs_human_review",
]
_FINAL_RENAMES = {
    "llm_category": "category",
    "llm_subcategory": "subcategory",
    "llm_gravity": "gravity",
    "llm_problem_type": "problem_type",
}
df_final = df_llm[_FINAL_COLUMNS].rename(columns=_FINAL_RENAMES)

# Normalize the label columns: NaN -> "" and strip surrounding whitespace.
for col in ("category", "subcategory", "gravity", "problem_type"):
    df_final[col] = df_final[col].fillna("").astype(str).str.strip()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Creazione DataFrame risultati
|
||||
# df_results = pd.DataFrame(output_rows)
|
||||
# output_path = "main/datasets/similarity_analysis.xlsx"
|
||||
# df_results.to_excel(output_path, index=False)
|
||||
|
||||
#wb = load_workbook(output_path)
|
||||
#ws = wb.active
|
||||
|
||||
#distanza_col_idx = None
|
||||
#for idx, cell in enumerate(ws[1], start=1):
|
||||
#if cell.value == "distanza":
|
||||
#distanza_col_idx = idx
|
||||
#break
|
||||
#if distanza_col_idx is None:
|
||||
#raise ValueError("Colonna 'distanza' non trovata!")
|
||||
|
||||
# Applichiamo i colori in base al valore
|
||||
#for row in ws.iter_rows(min_row=2, max_row=ws.max_row, min_col=distanza_col_idx, max_col=distanza_col_idx):
|
||||
#cell = row[0]
|
||||
#try:
|
||||
#val = float(cell.value)
|
||||
#if val < 0.5:
|
||||
#color = "90EE90" # verde chiaro
|
||||
#elif val < 1.0:
|
||||
#color = "FFFF00" # giallo
|
||||
#else:
|
||||
#color = "FF6347" # rosso
|
||||
#cell.fill = PatternFill(start_color=color, end_color=color, fill_type="solid")
|
||||
#except:
|
||||
#continue
|
||||
|
||||
# Salva il file direttamente con colori applicati
|
||||
#wb.save(output_path)
|
||||
#print(f"Excel salvato in {output_path}")
|
||||
|
|
@ -0,0 +1,288 @@
|
|||
# System/task instructions given to the security-classification model.
task = """
You are a security evaluation tool for smart home automation rules.
Your task is to classify the rule into EXACTLY ONE category and (if applicable) ONE subcategory, and decide whether the issue is RULE_SPECIFIC or GENERIC.

You will receive:
- The automation rule (text).
- Definitions of 4 categories (with subcategories).
- Retrieved similar labeled examples (RAG context) with distance-based similarity levels.

Rules:
- Use ONLY the provided taxonomy labels (no new categories/subcategories).
- If retrieved examples are weak/unstable or the rule is ambiguous, set needs_human_review=true.
- Return ONLY a valid JSON object (no extra text).
"""
|
||||
taxonomy = """
|
||||
* CATEGORIES AND DEFINITIONS (use ONLY these labels) *
|
||||
|
||||
# CATEGORY 1: PERSONAL PRIVACY VIOLATION
|
||||
Definition: This category includes automations that collect, process, or disseminate personal or sensitive data (own or third-party) in an unnecessary, unintentional, or potentially illicit manner.
|
||||
Necessary rule conditions: the rule collects sensitive information (images, video, audio, location, identity).
|
||||
Risk Conditions:
|
||||
R1: The data is shared with external parties
|
||||
- neighbors
|
||||
- WhatsApp groups
|
||||
- social media
|
||||
- mailing lists
|
||||
- public channels
|
||||
R2: The data is made publicly accessible
|
||||
- publicly visible
|
||||
- public link
|
||||
- open profile
|
||||
R3: The collection or sharing is not necessary for the stated purpose or is not clearly understood
|
||||
- no confirmation
|
||||
- no review before publication
|
||||
R4: The data collection may violate privacy regulations
|
||||
- camera photographing passers-by
|
||||
- audio recording of guests
|
||||
Do not apply if:
|
||||
- The data remains strictly internal and is not shared, exposed, or externally observable.
|
||||
- The automation only updates internal system states without transmitting or publishing personal data.
|
||||
- The data collection is clearly necessary for the rule's primary functionality and remains locally processed.
|
||||
Illustrative examples (for guidance only):
|
||||
- When the camera on my smart doorbell detects an unknown/suspicious person (e.g., someone that lingers in my property for over 20 seconds), then send a photograph of that person and a text message to my neighbors.
|
||||
- If I take a new photo, then post it as a publicly accessible image on Instagram
|
||||
|
||||
# CATEGORY 2: PHYSICAL ENVIRONMENT THREAT
|
||||
Definition: This category includes automations that can cause unauthorized access, reduced physical security, or property damage.
|
||||
Sub-categories: 2.1 PROMOTE UNAUTHORIZED ACCESS, 2.2 DEVICE AND IDENTITY CONTROL, 2.3 VOICE PROFILE CONTROLS, 2.4 ABSENCE STATUS REPORTING
|
||||
|
||||
## SUB-CATEGORY 2.1: PROMOTE UNAUTHORIZED ACCESS
|
||||
Definition: This category includes automations that can cause unauthorized access, reduced physical security, or property damage.
|
||||
Necessary rule conditions:
|
||||
- Actions on: windows / doors / locks
|
||||
- Automatic activations based on: environmental conditions / unauthenticated events
|
||||
Risk Conditions:
|
||||
- The action reduces physical protection.
|
||||
- There is no identity check.
|
||||
- The event can be externally induced.
|
||||
Does not apply if:
|
||||
- There are already security measures such as checking the user's presence at home.
|
||||
- The rule only modifies non-security-related elements (e.g., lights, temperature).
|
||||
- The action is manually confirmed before execution.
|
||||
Illustrative examples (for guidance only):
|
||||
- When the smart thermostat detects that the temperature rises above 25 degrees, then slightly open the window.
|
||||
- If Indoor CO2 goes up, open the window.
|
||||
|
||||
|
||||
## SUB-CATEGORY 2.2: Device and identity control (device-based access)
|
||||
Definition: Automations that grant physical access based solely on the presence of a device, without considering theft, compromise, or old, unremoved devices.
|
||||
Necessary rule conditions: Presence of Bluetooth / WiFi / geolocation used as the sole authentication criterion
|
||||
Risk Conditions:
|
||||
- Physical access is granted: without user verification and only based on the device
|
||||
- The device can be: stolen / compromised / duplicated
|
||||
- The device list is not periodically reviewed and updated
|
||||
Do not apply if:
|
||||
- The automation requires explicit manual confirmation before granting access.
|
||||
- Additional authentication mechanisms are enforced (e.g., PIN, biometric verification, multi-factor authentication).
|
||||
- The device presence is not the sole authentication factor.
|
||||
- The rule does not grant physical access but only sends notifications or status updates.
|
||||
Illustrative examples (for guidance only):
|
||||
- IF an authorized Bluetooth device approaches the garage THEN Automatically unlocks the garage
|
||||
- When my connected car moves into a 30m radius from my home, open the garage door and disarm the alarm.
|
||||
- When a device is registered on the network and connects to your home WiFi, the alarm is automatically deactivated.
|
||||
|
||||
## SUB-CATEGORY 2.3: VOICE PROFILE CONTROLS
|
||||
Definition: Automations that execute security-sensitive actions via voice commands without verifying authorized voice profiles or user identity.
|
||||
Necessary rule conditions:
|
||||
- The automation is triggered by a voice command.
|
||||
- The command affects security-sensitive actions (e.g., unlocking, disarming, disabling protections).
|
||||
Risk Conditions:
|
||||
- The command can be executed by anyone
|
||||
- There is no control over the user's identity
|
||||
- No specific authorized voice profiles are stored
|
||||
Do not apply if:
|
||||
- The voice command triggers only non-security-sensitive actions (e.g., turning lights on/off).
|
||||
- The automation can only be executed from inside the home after physical access has already been established.
|
||||
- The system verifies authorized voice profiles before executing critical actions.
|
||||
- Additional authentication mechanisms are required for security-sensitive operations.
|
||||
Illustrative examples (for guidance only):
|
||||
- IF the voice assistant recognizes the command "Disable alarm" THEN Disable the home security system
|
||||
|
||||
|
||||
## SUB-CATEGORY 2.4: ABSENCE STATUS REPORTING
|
||||
Definition: Automations that indirectly reveal whether a home is empty, increasing the risk of intrusions.
|
||||
Necessary rule conditions:
|
||||
- Actions that: turn lights on/off; modify Wi-Fi/alarms
|
||||
- The actions are related to presence at home
|
||||
Risk Conditions:
|
||||
- The rule allows us to deduce whether the house is empty.
|
||||
- The information is: observable from the outside or shared with third parties.
|
||||
Do not apply if:
|
||||
- The automation is not externally observable.
|
||||
- The information is not shared outside the household.
|
||||
- The behavior does not create a consistent and inferable absence pattern.
|
||||
- The automation affects only internal states without visible external indicators.
|
||||
Illustrative examples (for guidance only):
|
||||
- IF someone is home, THEN turn the light
|
||||
- If the last family member leaves home, then turn off lights
|
||||
- IF I leave home, THEN turn off the WiFi
|
||||
- If I'm the last person in the house and leave, send a notification to my smartwatch if any window in the house is open
|
||||
|
||||
|
||||
# CATEGORY 3: CYBERSECURITY HARM
|
||||
Description: This category includes automations that introduce malware exposure, data stream manipulation, or network communication abuse.
|
||||
Sub-categories: SUB-CATEGORY 3.1: MALICIOUS TRAFFIC GENERATION, SUB-CATEGORY 3.2: AUTOMATIC FILE SPREAD, SUB-CATEGORY 3.3: NETWORK COMMUNICATION THREATS
|
||||
|
||||
## SUB-CATEGORY 3.1: MALICIOUS TRAFFIC GENERATION
|
||||
Definition: Automations that can be exploited to generate excessive traffic, false alarms, or denial of service
|
||||
Rule conditions: The event is easily repeatable.
|
||||
Risk conditions:
|
||||
- The event can generate: excessive traffic / false alarms
|
||||
- The event is manipulable
|
||||
Do not apply if:
|
||||
- The event cannot be externally triggered or manipulated.
|
||||
- The notification is rate-limited or protected against repeated activation.
|
||||
- The action does not significantly increase network load or system exposure.
|
||||
Illustrative examples (for guidance only):
|
||||
- If the smart camera detects someone approaching, it automatically sends a text message to my phone
|
||||
|
||||
## SUB-CATEGORY 3.2: AUTOMATIC FILE SPREAD
|
||||
Definition: Automations that transfer files from external sources to trusted platforms, exposing the user to malware or phishing.
|
||||
Rule conditions:
|
||||
The automation involves automatic file download from:
|
||||
- external URLs
|
||||
- email attachments
|
||||
- messaging platforms
|
||||
- third-party APIs
|
||||
The automation stores, uploads, forwards, or makes the file available within:
|
||||
- trusted cloud storage
|
||||
- local systems
|
||||
- shared folders
|
||||
- collaboration platforms
|
||||
Risk conditions:
|
||||
- There is no content check (e.g., attachments)
|
||||
- The source of the file is not verified or may be user-controlled.
|
||||
- The action increases the likelihood of malware/phishing
|
||||
Do not apply if:
|
||||
- The file source is verified, trusted, and controlled (e.g., official governmental APIs).
|
||||
- The automation includes content validation or malware scanning before storage.
|
||||
- The user manually confirms the download before execution.
|
||||
- No file propagation to additional platforms occurs.
|
||||
Illustrative examples (for guidance only):
|
||||
- Add file from URL action from the Dropbox channel when the “Any new attachment in inbox ” trigger from the Gmail channel is activated
|
||||
|
||||
## SUB-CATEGORY 3.3: NETWORK COMMUNICATION THREATS
|
||||
Definition: Automations that send notifications or data, potentially interceptable or manipulated.
|
||||
Rule conditions:
|
||||
- The automation sends data or notifications over: SMS, messaging platforms, email.
|
||||
- The transmitted information relates to security-relevant events, such as absence of occupants, alarm status, door/window state.
|
||||
Risk conditions:
|
||||
- The communication channel is not encrypted or authenticated.
|
||||
- Messages can be intercepted, spoofed, or altered in transit.
|
||||
Do not apply if:
|
||||
- The communication is encrypted and authenticated.
|
||||
- The communication does not expose the system to interception or spoofing risks.
|
||||
- The transmitted data does not expose occupancy, alarm status, or access control states.
|
||||
Illustrative examples (for guidance only):
|
||||
- If the smart camera detects someone approaching, it automatically sends a text message to my phone
|
||||
|
||||
# CATEGORY 4: HARMLESS
|
||||
Definition: automations that do not present safety problems.
|
||||
Conditions:
|
||||
- The rule does not involve personal data
|
||||
- The rule does not modify the physical environment
|
||||
- The rule does not introduce risky network communications
|
||||
- The rule already includes device/user/presence checks
|
||||
Illustrative examples (for guidance only):
|
||||
- If it rains tomorrow, then remind me to bring an umbrella
|
||||
"""
|
||||
|
||||
# Guidance for the problem_type field (RULE_SPECIFIC vs GENERIC).
problem_type_guide = """
* PROBLEM TYPE (choose exactly one) *:

# RULE_SPECIFIC (S): the automation directly leads to a potentially dangerous situation.
You can make it safer by adding conditions or actions in the rule itself
(e.g., verifying presence at home, identity check, confirmation step).
Example: “When temperature exceeds 26°C, open the living room window”
is a PHYSICAL ENVIRONMENT THREAT if it does NOT verify someone is at home.

# GENERIC (G): the automation is not inherently dangerous; risk depends on configuration
or contextual factors. The best mitigation is a user behavior recommendation rather
than changing the rule logic.
Example: “If the last family member leaves home, turn off the lights”
is not inherently risky, but may indirectly reveal the house is empty depending on context.
"""
|
||||
# Guidance for the gravity/severity field.
gravity_guide = """
* GRAVITY / SEVERITY (choose exactly one) *:

# HIGH: direct and immediate security/privacy consequence.
Examples: automatically opening doors; public photos without consent; malware propagation.

# MEDIUM: indirect consequence or conditioned on other variables.
Examples: absence deducible from light patterns; opening door via Bluetooth/device proximity.

# LOW: minimal risk, marginal information leakage, or easily mitigable.
Examples: notifications that might hint the user is away only if intercepted;
downloads from relatively trusted sources with limited exposure.

# NONE: no security/privacy consequence (comfort rules).
Examples: lights/temperature/irrigation/morning routine.
"""
|
||||
# JSON shape the model is instructed to return (embedded verbatim in the prompt).
OUTPUT_SCHEMA = """
Return ONLY this JSON:

{
"automation": "string",
"category": "PERSONAL PRIVACY VIOLATION | PHYSICAL ENVIRONMENT THREAT | CYBERSECURITY HARM | HARMLESS",
"subcategory": "one of the defined subcategories for that category, or empty string",
"problem_type": "RULE_SPECIFIC | GENERIC | none",
"gravity": "LOW | MEDIUM | HIGH | NONE",
"scores": {
"PERSONAL PRIVACY VIOLATION": 0.0,
"PHYSICAL ENVIRONMENT THREAT": 0.0,
"CYBERSECURITY HARM": 0.0,
"HARMLESS": 0.0
},
"needs_human_review": true,
"short_rationale": "max 2 sentences"
}
"""
|
||||
# Turn the retrieval results (the k similar automations + distances) into text
# that is passed to the LLM as labeled examples.
def build_examples_text(retrieved_df, distance_band_fn, max_chars=600):
    """Render retrieved labeled examples as prompt text.

    Parameters:
        retrieved_df: DataFrame with columns automation, description, category,
            subcategory, problem_type, gravity, distance (one row per example).
        distance_band_fn: callable mapping a float distance to a similarity label.
        max_chars: truncation limit for the automation text (description is
            always truncated at 200 characters).

    Returns:
        One "Example N:" paragraph per row, joined with newlines.
    """
    parts = []
    # IDIOM FIX: unpack the (index, Series) pair directly in the loop header
    # instead of the original `for i, r in enumerate(...)` + `_, r = r`.
    for i, (_, r) in enumerate(retrieved_df.iterrows(), start=1):
        d = float(r["distance"])
        parts.append(
            f"""Example {i}:
Automation: {str(r.get('automation',''))[:max_chars]}
Description: {str(r.get('description',''))[:200]}
Category: {r.get('category','')}
Subcategory: {r.get('subcategory','')}
Problem type: {r.get('problem_type','')}
Gravity: {r.get('gravity','')}
Distance: {d}
Similarity level: {distance_band_fn(d)}
"""
        )
    return "\n".join(parts)
||||
# Prompt assembly.
def build_prompt_local(query_text, retrieved_df, distance_band_fn):
    """Compose the full classification prompt.

    Concatenates task instructions, taxonomy, guides, the automation to label,
    top-1 retrieval metrics and the rendered retrieved examples, ending with
    the required JSON output schema.
    """
    top1_dist = float(retrieved_df["distance"].iloc[0])
    band = distance_band_fn(top1_dist)
    examples_text = build_examples_text(retrieved_df, distance_band_fn)

    prompt = f"""{task}

{taxonomy}
{problem_type_guide}
{gravity_guide}

AUTOMATION TO LABEL:
{query_text}

TOP1_DISTANCE: {top1_dist}
SIMILARITY_BAND: {band}

RETRIEVED SIMILAR LABELED EXAMPLES (top-k):
{examples_text}

{OUTPUT_SCHEMA}
"""
    return prompt
Loading…
Reference in New Issue