analisi esercitazione
This commit is contained in:
parent
85c03b3a1a
commit
4b76e75058
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,476 @@
|
|||
from huggingface_hub import login
|
||||
import os
|
||||
import gc
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
# Keep Hugging Face downloads inside the project folder.
# NOTE(review): huggingface_hub is imported before this assignment; confirm the
# library still honours HF_HOME when it is set after import.
os.environ['HF_HOME'] = './cache_huggingface' # or just "." for directly in current folder
#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Login into Hugging Face Hub.
# The access token is a secret and must never be committed: it is read from the
# HF_TOKEN environment variable (export HF_TOKEN=hf_...). The token that used to
# be hard-coded here is exposed in the repository history and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN environment variable is not set; export a Hugging Face access token before running this script."
    )
print("Logging into Hugging Face Hub...")
login(hf_token)
print("Logged in.")
|
||||
from datasets import load_dataset
|
||||
from PIL import Image
|
||||
|
||||
# System message for the assistant.
# Used by format_data as the text of the "system" turn of every conversation.
# NOTE(review): "alterative" looks like a typo for "alternative"; it is left
# untouched here because this string is part of the prompt sent to the model.
system_message = "You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for images on webpages are appropriate according to WCAG guidelines."

# User prompt template. The {alttext} and {HTML_context} placeholders are
# filled in with str.format by format_data.
user_prompt = """Create the most appropriate new alt-text given the image, the <HTML context>, and the current <alt-text>. Keep this within 30 words. Use the same language as the original alt-text.
Only return the new alt-text.

<alt-text>
{alttext}
</alt-text>

<HTML context>
{HTML_context}
</HTML context>

"""
|
||||
|
||||
def download_hf_model(model_id, output_dir="./hf_model"):
    """Fetch a snapshot of a Hugging Face repository into a local folder.

    Args:
        model_id: Repository id, forwarded to ``snapshot_download`` as ``repo_id``.
        output_dir: Destination folder, forwarded as ``local_dir``.

    Returns:
        Whatever ``snapshot_download`` returns (printed as the download location).
    """
    print(f"Downloading {model_id} from Hugging Face...")
    download_options = {
        "repo_id": model_id,
        "local_dir": output_dir,
        # NOTE(review): recent huggingface_hub releases may deprecate this
        # flag - confirm against the installed version.
        "local_dir_use_symlinks": False,
    }
    model_path = snapshot_download(**download_options)
    print(f"Model downloaded to: {model_path}")
    return model_path
|
||||
|
||||
def convert_to_gguf(model_path, output_path="./model.gguf",
                    convert_script="./llama.cpp/convert_hf_to_gguf.py"):
    """Convert a local Hugging Face model folder to GGUF with llama.cpp.

    Requires a checkout of llama.cpp (https://github.com/ggerganov/llama.cpp)
    that provides the ``convert_hf_to_gguf.py`` script.

    Args:
        model_path: Folder containing the Hugging Face model to convert.
        output_path: Destination of the generated ``.gguf`` file.
        convert_script: Location of the llama.cpp conversion script.

    Returns:
        ``output_path`` on success, ``None`` when the script is missing or the
        conversion process exits with an error.
    """
    import sys  # local import: only needed to locate the running interpreter

    print("Converting to GGUF format...")

    # A missing script does not raise FileNotFoundError from subprocess.run
    # (the interpreter itself is found), so check for it explicitly.
    if not Path(convert_script).is_file():
        print("Error: llama.cpp convert.py not found.")
        print("Please clone llama.cpp: git clone https://github.com/ggerganov/llama.cpp")
        return None

    cmd = [
        sys.executable, str(convert_script),  # same interpreter/venv as this script
        str(model_path),
        "--outfile", str(output_path),
        "--outtype", "f16",  # Use f16 for better quality
    ]

    try:
        subprocess.run(cmd, check=True)
    except FileNotFoundError:
        # The interpreter could not be started at all.
        print("Error: could not start the Python interpreter for the conversion.")
        return None
    except subprocess.CalledProcessError as exc:
        print(f"Error: GGUF conversion failed with exit code {exc.returncode}.")
        return None

    print(f"GGUF model created: {output_path}")
    return output_path
|
||||
|
||||
def create_modelfile(model_name, gguf_path, template=None):
    """Write an Ollama Modelfile that points at a GGUF model.

    Args:
        model_name: Prefix of the output file; the file is written to
            ``model_name + "Modelfile"``.
        gguf_path: Path placed on the ``FROM`` line.
        template: Prompt template; when empty or ``None`` a generic
            System/User/Assistant chat template is used.

    Returns:
        The path of the Modelfile that was written.
    """
    default_template = (
        "{{ if .System }}System: {{ .System }}\n"
        "{{ end }}{{ if .Prompt }}User: {{ .Prompt }}\n"
        "{{ end }}Assistant: "
    )
    chosen_template = template if template else default_template

    modelfile_content = "\n".join([
        f"FROM {gguf_path}",
        "",
        "# Set parameters",
        "PARAMETER temperature 0.7",
        "PARAMETER top_p 0.9",
        "PARAMETER top_k 40",
        "",
        "# Set the prompt template (adjust based on your model)",
        f'TEMPLATE """{chosen_template}"""',
    ])

    modelfile_path = model_name + "Modelfile"
    # Explicit UTF-8 so non-ASCII templates are written identically on every platform.
    with open(modelfile_path, "w", encoding="utf-8") as f:
        f.write(modelfile_content)

    print(f"Modelfile created: {modelfile_path}")
    return modelfile_path
|
||||
|
||||
|
||||
|
||||
|
||||
# NB: inference uses the image plus the two text fields (same instruction as in fine-tuning)
def generate_description(dataset, model, processor, example_idx=0):
    """Generate a new alt-text for one raw dataset sample.

    Args:
        dataset: Indexable collection of raw samples accepted by ``format_data``.
        model: Model exposing ``generate`` and ``device``.
        processor: Processor exposing ``apply_chat_template``, ``batch_decode``
            and ``tokenizer``.
        example_idx: Index of the sample to use (defaults to the first one).

    Returns:
        The decoded text generated for the sample.
    """
    print("Generating description...")

    # Build the conversation and drop the assistant turn (the expected answer),
    # so only the system and user messages are sent to the model.
    format_data_example = format_data(dataset[example_idx])
    messages = format_data_example["messages"][0:2]
    print("User message:", messages)
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

    # Collect the images referenced by the messages (converted to RGB).
    image_inputs = process_vision_info(messages)
    print("image_inputs:", image_inputs)

    # Tokenize the text and process the images
    inputs = processor(
        text=[text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Move the inputs to the device
    inputs = inputs.to(model.device)

    # Stop on either the tokenizer EOS token or the "<end_of_turn>" marker.
    stop_token_ids = [
        processor.tokenizer.eos_token_id,
        processor.tokenizer.convert_tokens_to_ids("<end_of_turn>"),
    ]
    generated_ids = model.generate(**inputs, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8, eos_token_id=stop_token_ids, disable_compile=True)

    # Strip the prompt tokens so only the newly generated tokens are decoded.
    generated_ids_trimmed = [out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return output_text[0]
|
||||
|
||||
# Convert dataset to OAI messages
|
||||
def format_data(sample):
    """Turn one raw dataset sample into a three-turn chat conversation.

    The result has a single ``"messages"`` key holding the system turn, the
    user turn (prompt text followed by the RGB image) and the assistant turn
    carrying the expected new alt-text.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    }

    prompt_text = user_prompt.format(
        HTML_context=sample["html_context"],
        alttext=sample["alt_text"],
    )
    rgb_image = sample["image"].convert("RGB")
    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": prompt_text},
            {"type": "image", "image": rgb_image},
        ],
    }

    # The assistant turn is the answer the model is expected to produce.
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "text", "text": sample["new_alt_text"]}],
    }

    return {"messages": [system_turn, user_turn, assistant_turn]}
|
||||
|
||||
|
||||
|
||||
def process_vision_info(messages: "list[dict]") -> "list[Image.Image]":
    """Collect every image referenced in a conversation, converted to RGB.

    Args:
        messages: Chat messages; each message's ``"content"`` is either a list
            of content elements or a single value.

    Returns:
        The images found, in order of appearance, each passed through
        ``.convert("RGB")``.

    Raises:
        ValueError: If an element is marked as an image but carries no image.
    """
    print("Processing vision info...")
    image_inputs = []
    for msg in messages:
        # Get content (ensure it's a list)
        content = msg.get("content", [])
        if not isinstance(content, list):
            content = [content]

        for element in content:
            if not isinstance(element, dict):
                continue
            if "image" not in element and element.get("type") != "image":
                continue
            image = element.get("image")
            if image is None:
                # Previously the element dict itself was used as the image,
                # which failed with an opaque AttributeError on .convert().
                raise ValueError(
                    "Image content element has no 'image' value: %r" % (element,)
                )
            image_inputs.append(image.convert("RGB"))
    return image_inputs
|
||||
|
||||
print("Loading dataset...")
# Load dataset from the hub
#dataset = load_dataset("philschmid/amazon-product-descriptions-vlm", split="train",cache_dir="./dataset_cache")
dataset = load_dataset("nicolaleo/LLM-alt-text-assessment", split="train",cache_dir="./dataset_cache")


from copy import deepcopy

# Keep a copy of the raw samples: `dataset` is rebound below to the formatted
# conversations, while generate_description formats raw samples itself.
dataset_copy=deepcopy(dataset)


# Convert dataset to OAI messages
# A list comprehension is needed to keep the PIL.Image type; .map converts images to bytes
dataset = [format_data(sample) for sample in dataset]


print(dataset[0]["messages"])
|
||||
|
||||
import torch
|
||||
print("Freeing up memory...")
# Collect unreachable Python objects first so the CUDA blocks they held can
# then be released from PyTorch's caching allocator.
gc.collect()
torch.cuda.empty_cache()

# Query free/total device memory (in bytes) once so both values come from the
# same snapshot.
free_memory, total_memory = torch.cuda.mem_get_info()

# Convert to GB for readability
free_gb = free_memory / (1024**3)
total_gb = total_memory / (1024**3)

print(f"Free: {free_gb:.2f} GB / Total: {total_gb:.2f} GB")
|
||||
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
|
||||
|
||||
# Hugging Face model id
model_id = "google/gemma-3-4b-it"#"google/gemma-3-4b-pt"#"google/gemma-3-4b-pt" # or `google/gemma-3-12b-pt`, `google/gemma-3-27-pt`

# Check if GPU benefits from bfloat16
#if torch.cuda.get_device_capability()[0] < 8:
#    raise ValueError("GPU does not support bfloat16, please use a GPU that supports bfloat16.")

# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch.bfloat16,#torch.float16,#torch.bfloat16, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model

)

# BitsAndBytesConfig int-4 config: NF4 with double quantization; the compute
# and storage dtypes reuse the torch_dtype chosen above.
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
    bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
)

# Load model and tokenizer
#model = AutoModelForImageTextToText.from_pretrained(model_id, **model_kwargs)
#processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")


# Set the cache directory to current folder
cache_dir = "./model_cache" # or just "." for directly in current folder

print("Loading model... This may take a while.")
model = AutoModelForImageTextToText.from_pretrained(# 4-bit quantized version
    model_id,
    cache_dir=cache_dir,
    **model_kwargs
)
print("Model loaded.")


# The processor is cached in its own folder, separate from the model weights.
proc_cache_dir = "./proc_cache"
print("Loading processor...")
processor = AutoProcessor.from_pretrained(
    "google/gemma-3-4b-it",#model_id, # the original file uses the -it processor rather than -pt (little difference anyway)
    cache_dir=proc_cache_dir
)
print("Processor loaded.")
|
||||
|
||||
|
||||
print("testing the loaded model...")
# Generate a description with the model before any fine-tuning (baseline check).
description = generate_description(dataset_copy, model, processor)
print("text generated:",description)


# Save the loaded model and processor to a local folder.
# Note: `model` was loaded above with the 4-bit quantization config, so this
# saves that quantized instance.
print("Saving model and processor locally...")
save_path = "./original_local_model_"+model_id.replace("/", "_")
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
print("Model and processor saved.")
|
||||
|
||||
|
||||
""" # la convesrione in ollama funziona solo se fatta su modello non quantizzato (da capire se si può fare su modello 4bit)
|
||||
print("Converting and importing model to Ollama...")
|
||||
# Step 1: Download from Hugging Face
|
||||
model_path= "./original_local_model_ollama"
|
||||
model_path = download_hf_model(model_id,output_dir=model_path)
|
||||
|
||||
# Step 2: Convert to GGUF (requires llama.cpp)
|
||||
gguf_path = convert_to_gguf(model_path, "./gemma.gguf")
|
||||
|
||||
if gguf_path:
|
||||
# Step 3: Create Modelfile
|
||||
OLLAMA_MODEL_NAME = "gemma3-wcag"
|
||||
modelfile = create_modelfile(OLLAMA_MODEL_NAME, gguf_path)
|
||||
|
||||
"""
|
||||
|
||||
|
||||
|
||||
from peft import LoraConfig
|
||||
|
||||
# LoRA adapter configuration applied to all linear layers.
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    #modules_to_save=[ # this was what consumed the extra memory
    #    "lm_head",
    #    "embed_tokens",
    #],
)

from trl import SFTConfig

args = SFTConfig(
    output_dir="./gemma-finetuned-wcag_"+model_id.replace("/", "_"),         # directory to save and repository id
    num_train_epochs=1,                         # number of training epochs
    per_device_train_batch_size=1,              # batch size per device during training
    gradient_accumulation_steps=4,              # number of steps before performing a backward/update pass
    gradient_checkpointing=True,                # use gradient checkpointing to save memory
    optim="adamw_torch_fused",                  # use fused adamw optimizer
    logging_steps=5,                            # log every 5 steps
    save_strategy="epoch",                      # save checkpoint every epoch
    learning_rate=2e-4,                         # learning rate, based on QLoRA paper
    bf16=True,#False,#True,                                  # use bfloat16 precision
    max_grad_norm=0.3,                          # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                          # warmup ratio based on QLoRA paper; NOTE(review): may have no effect with the "constant" scheduler - confirm
    lr_scheduler_type="constant",               # use constant learning rate scheduler
    push_to_hub=True,                           # push model to hub
    report_to="tensorboard",                    # report metrics to tensorboard
    gradient_checkpointing_kwargs={
        "use_reentrant": False
    },  # use_reentrant=False selects the non-reentrant checkpointing variant
    dataset_text_field="",                      # need a dummy field for collator
    dataset_kwargs={"skip_prepare_dataset": True},  # important for collator
)
args.remove_unused_columns = False # important for collator
|
||||
|
||||
# Create a data collator to encode text and image pairs
def collate_fn(examples):
    """Encode a list of formatted conversations into one training batch.

    Args:
        examples: Items with a ``"messages"`` conversation, as produced by
            ``format_data``.

    Returns:
        The processor output extended with ``"labels"``: a copy of
        ``input_ids`` where padding and image tokens are set to -100.
    """
    texts = []
    images = []
    for example in examples:
        image_inputs = process_vision_info(example["messages"])
        text = processor.apply_chat_template(
            example["messages"], add_generation_prompt=False, tokenize=False
        )
        texts.append(text.strip())
        images.append(image_inputs)

    # Tokenize the texts and process the images
    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

    # The labels are the input_ids, with padding and image tokens masked out
    # of the loss computation.
    labels = batch["input_ids"].clone()

    # Begin-of-image token id, kept as a plain int: the one-element list used
    # before is not a valid operand for an element-wise tensor comparison, so
    # that mask was not reliably applied.
    boi_token_id = processor.tokenizer.convert_tokens_to_ids(
        processor.tokenizer.special_tokens_map["boi_token"]
    )
    # 262144: presumably the id of the image placeholder token of this
    # tokenizer - TODO confirm.
    ignored_token_ids = (processor.tokenizer.pad_token_id, boi_token_id, 262144)
    for token_id in ignored_token_ids:
        labels[labels == token_id] = -100

    batch["labels"] = labels
    return batch
|
||||
|
||||
from trl import SFTTrainer
|
||||
|
||||
# Supervised fine-tuning trainer: LoRA adapters on top of the loaded model,
# with the custom collator that encodes text + image pairs.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    peft_config=peft_config,
    processing_class=processor,
    data_collator=collate_fn,
)

print("Starting training...")
# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

print("Training completed.")
# Save the final model again to the Hugging Face Hub
trainer.save_model()# NOTE(review): presumably writes the trained adapter to args.output_dir (and pushes it, since push_to_hub=True) - confirm

# free the memory again
del model
del trainer
torch.cuda.empty_cache()
|
||||
|
||||
from peft import PeftModel
|
||||
|
||||
# Load the base model without quantization so the LoRA weights can be merged into it
model = AutoModelForImageTextToText.from_pretrained(model_id, low_cpu_mem_usage=True,cache_dir=cache_dir)

# Single definition of the merged-model folder, used for saving and reloading.
merged_model_dir = "merged_model_"+model_id.replace("/", "_")

# Merge LoRA and base model and save
peft_model = PeftModel.from_pretrained(model, args.output_dir)
merged_model = peft_model.merge_and_unload()
merged_model.save_pretrained(merged_model_dir, safe_serialization=True, max_shard_size="2GB")

processor = AutoProcessor.from_pretrained(args.output_dir)
processor.save_pretrained(merged_model_dir)

# Drop the in-memory copies before loading the merged checkpoint again.
del merged_model, peft_model, model
gc.collect()


print("Loading merged model for inference...")
# Load the merged model from the folder it was just saved to. Previously this
# read args.output_dir (the adapter checkpoint) instead of the merged weights.
model = AutoModelForImageTextToText.from_pretrained(
    merged_model_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
)
processor = AutoProcessor.from_pretrained(merged_model_dir)


print("testing the merged model...")


"""
import requests
from PIL import Image

# Test sample with Product Name, Category and Image
sample = {
"product_name": "Hasbro Marvel Avengers-Serie Marvel Assemble Titan-Held, Iron Man, 30,5 cm Actionfigur",
"category": "Toys & Games | Toy Figures & Playsets | Action Figures",
"image": Image.open(requests.get("https://m.media-amazon.com/images/I/81+7Up7IWyL._AC_SY300_SX300_.jpg", stream=True).raw).convert("RGB")
}
"""


# generate the description
description = generate_description(dataset_copy, model, processor)
print("text generated:",description)
|
||||
|
|
@ -6,4 +6,5 @@ numpy==2.2.6
|
|||
matplotlib==3.10.7
|
||||
scikit-learn==1.7.2
|
||||
sentence-transformers==5.1.2
|
||||
datasets==4.4.1
|
||||
datasets==4.4.1
|
||||
bert-score==0.3.13
|
||||
|
|
@ -0,0 +1,280 @@
|
|||
from huggingface_hub import login
|
||||
import os
|
||||
from datasets import load_dataset
|
||||
from PIL import Image
|
||||
import torch
|
||||
from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
|
||||
import gc
|
||||
|
||||
# System message for the assistant.
# Used by format_data as the text of the "system" turn of every conversation.
# NOTE(review): "alterative" looks like a typo for "alternative"; it is left
# untouched here because this string is part of the prompt sent to the model.
system_message = "You are a web accessibility evaluation tool. Your task is to evaluate if alterative text for images on webpages are appropriate according to WCAG guidelines."

# User prompt template. The {alttext} and {HTML_context} placeholders are
# filled in with str.format by format_data.
user_prompt = """Create the most appropriate new alt-text given the image, the <HTML context>, and the current <alt-text>. Keep this within 30 words. Use the same language as the original alt-text.
Only return the new alt-text.

<alt-text>
{alttext}
</alt-text>

<HTML context>
{HTML_context}
</HTML context>

"""
|
||||
def process_vision_info(messages: "list[dict]") -> "list[Image.Image]":
    """Return all images referenced by the conversation, converted to RGB.

    Args:
        messages: Chat messages; a message's ``"content"`` may be a list of
            content elements or a single value.

    Returns:
        Images in order of appearance, each passed through ``.convert("RGB")``.

    Raises:
        ValueError: If an element is tagged as an image but holds no image.
    """
    image_inputs = []
    for msg in messages:
        # Normalise the content to a list so both shapes are handled alike.
        content = msg.get("content", [])
        if not isinstance(content, list):
            content = [content]

        for element in content:
            is_image_element = isinstance(element, dict) and (
                "image" in element or element.get("type") == "image"
            )
            if not is_image_element:
                continue
            image = element.get("image")
            if image is None:
                # The element dict itself used to be treated as the image,
                # which crashed with an AttributeError on .convert().
                raise ValueError(
                    "Image content element has no 'image' value: %r" % (element,)
                )
            image_inputs.append(image.convert("RGB"))
    return image_inputs
|
||||
|
||||
def format_data(sample):
    """Build the system/user/assistant conversation for one raw sample.

    Returns a dict with a single ``"messages"`` list: the system instruction,
    the user prompt (text plus RGB image) and the expected new alt-text as
    the assistant reply.
    """
    conversation = []

    conversation.append({
        "role": "system",
        "content": [{"type": "text", "text": system_message}],
    })

    filled_prompt = user_prompt.format(
        HTML_context=sample["html_context"],
        alttext=sample["alt_text"],
    )
    picture = sample["image"].convert("RGB")
    conversation.append({
        "role": "user",
        "content": [
            {"type": "text", "text": filled_prompt},
            {"type": "image", "image": picture},
        ],
    })

    # Expected answer, carried by the assistant role.
    conversation.append({
        "role": "assistant",
        "content": [{"type": "text", "text": sample["new_alt_text"]}],
    })

    return {"messages": conversation}
|
||||
|
||||
def generate_description(dataset, model, processor,example_idx=0):
    """Generate a new alt-text for the raw sample at ``example_idx``.

    The sample is formatted with ``format_data``; only the system and user
    turns are sent to the model (the assistant turn is the expected answer).
    Returns the decoded text of the single generated sequence.
    """
    print("Generating description...")

    conversation = format_data(dataset[example_idx])["messages"]
    messages = conversation[0:2]  # drop the assistant turn

    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Images referenced by the prompt, converted to RGB.
    image_inputs = process_vision_info(messages)

    # Tokenize the text, process the images and move everything to the model device.
    inputs = processor(
        text=[prompt_text],
        images=image_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Generation stops at the EOS token or at the "<end_of_turn>" marker.
    tokenizer = processor.tokenizer
    stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]
    generated_ids = model.generate(**inputs, max_new_tokens=256, top_p=1.0, do_sample=True, temperature=0.8, eos_token_id=stop_token_ids, disable_compile=True)

    # Keep only the tokens produced after the prompt, then decode them.
    new_token_ids = []
    for prompt_ids, full_ids in zip(inputs.input_ids, generated_ids):
        new_token_ids.append(full_ids[len(prompt_ids):])
    decoded = processor.batch_decode(
        new_token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return decoded[0]
|
||||
|
||||
from peft import PeftModel
|
||||
|
||||
|
||||
|
||||
# Keep Hugging Face downloads inside the project folder.
# NOTE(review): huggingface_hub is imported before this assignment; confirm the
# library still honours HF_HOME when it is set after import.
os.environ['HF_HOME'] = './cache_huggingface' # or just "." for directly in current folder
#os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Login into Hugging Face Hub.
# The access token is a secret and must never be committed: it is read from the
# HF_TOKEN environment variable (export HF_TOKEN=hf_...). The token that used to
# be hard-coded here is exposed in the repository history and should be revoked.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError(
        "HF_TOKEN environment variable is not set; export a Hugging Face access token before running this script."
    )
print("Logging into Hugging Face Hub...")
login(hf_token)
print("Logged in.")
|
||||
|
||||
|
||||
|
||||
|
||||
# Base model id and the folder the merged (fine-tuned) model is loaded from later.
# NOTE(review): the training script saves the merged model under
# "merged_model_" + model_id.replace("/", "_"); confirm this path matches.
model_id = "google/gemma-3-4b-it"
output_dir="./merged_model"#"./gemma-finetuned-wcag"


dataset = load_dataset("nicolaleo/LLM-alt-text-assessment", split="train",cache_dir="./dataset_cache")
from copy import deepcopy

# Copy of the raw samples passed to generate_description below.
dataset_copy=deepcopy(dataset)


# Separate cache folders for model weights and processor files.
cache_dir = "./model_cache"
proc_cache_dir = "./proc_cache"


# Model init arguments used for the 4-bit quantized load.
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch.bfloat16,#torch.float16,#torch.bfloat16, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model

)

# BitsAndBytesConfig int-4 config: NF4 with double quantization; the compute
# and storage dtypes reuse the torch_dtype chosen above.
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=model_kwargs["torch_dtype"],
    bnb_4bit_quant_storage=model_kwargs["torch_dtype"],
)
|
||||
|
||||
|
||||
print("Freeing up memory...")
torch.cuda.empty_cache()
gc.collect()

# Load the base model (#1) with library defaults: no quantization config,
# dtype or device_map is passed here.
model = AutoModelForImageTextToText.from_pretrained(model_id,cache_dir=cache_dir)
print("Model loaded #1")
#print(model)

# Load the pre-trained processor
processor = AutoProcessor.from_pretrained(
    "google/gemma-3-4b-it",#model_id, # the original file uses the -it processor rather than -pt (little difference anyway)
    cache_dir=proc_cache_dir
)
print("Processor loaded #1")

print("testing the model #1...")
# Generate descriptions for three fixed samples (indices 0, 1 and 20); the same
# indices are reused for every model variant so outputs can be compared.
description = generate_description(dataset_copy, model, processor,example_idx=0)
print("-text generated 1:",description)

description = generate_description(dataset_copy, model, processor,example_idx=1)
print("-text generated 2:",description)

description = generate_description(dataset_copy, model, processor,example_idx=20)
print("-text generated 3:",description)
|
||||
|
||||
print("Freeing up memory...")
# Drop the reference to the previous model first: emptying the CUDA cache
# while `model` is still alive cannot release the memory it holds.
del model
gc.collect()
torch.cuda.empty_cache()

# Load the model (#2) with 4-bit quantization
model = AutoModelForImageTextToText.from_pretrained(model_id,cache_dir=cache_dir, **model_kwargs)
print("\n Model loaded #2 with 4bit quantization")
#print(model)
processor = AutoProcessor.from_pretrained(
    "google/gemma-3-4b-it",#model_id, # the original file uses the -it processor rather than -pt (little difference anyway)
    cache_dir=proc_cache_dir
)
print("Processor loaded #2")

print("testing the model #2 with 4bit quantization...")
# Generate descriptions for the same three samples used for the other models.
description = generate_description(dataset_copy, model, processor,example_idx=0)
print("-text generated 1:",description)

description = generate_description(dataset_copy, model, processor,example_idx=1)
print("-text generated 2:",description)

description = generate_description(dataset_copy, model, processor,example_idx=20)
print("-text generated 3:",description)
|
||||
|
||||
"""
|
||||
# Merge LoRA and base model and save
|
||||
peft_model = PeftModel.from_pretrained(model, output_dir)
|
||||
merged_model = peft_model.merge_and_unload()
|
||||
merged_model.save_pretrained("merged_model", safe_serialization=True, max_shard_size="2GB")
|
||||
|
||||
processor = AutoProcessor.from_pretrained(output_dir)
|
||||
processor.save_pretrained("merged_model")
|
||||
|
||||
|
||||
print("Loading merged model for inference...")
|
||||
# Load Model with PEFT adapter
|
||||
model = AutoModelForImageTextToText.from_pretrained(
|
||||
output_dir,
|
||||
device_map="auto",
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation="eager",
|
||||
)
|
||||
processor = AutoProcessor.from_pretrained(output_dir)
|
||||
print("Model loaded #2")
|
||||
print(model)
|
||||
"""
|
||||
|
||||
print("Freeing up memory...")
# Drop the reference to the previous model first: emptying the CUDA cache
# while `model` is still alive cannot release the memory it holds.
del model
gc.collect()
torch.cuda.empty_cache()

# Load the merged (fine-tuned) model (#3) from output_dir
model = AutoModelForImageTextToText.from_pretrained(
    output_dir,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
)
print("\n Model loaded #3")
processor = AutoProcessor.from_pretrained(output_dir)
print("Processor loaded #3")
#print(model)


print("testing the Merged model #3 ...")


#dataset = [format_data(sample) for sample in dataset]

# Generate descriptions for the same three samples used for the other models.
description = generate_description(dataset_copy, model, processor,example_idx=0)
print("-text generated 1:",description)

description = generate_description(dataset_copy, model, processor,example_idx=1)
print("-text generated 2:",description)

description = generate_description(dataset_copy, model, processor,example_idx=20)
print("-text generated 3:",description)
|
||||
|
|
@ -0,0 +1,36 @@
|
|||
|
||||
import numpy as np
|
||||
from transformers import BertTokenizer, BertModel
|
||||
from sklearn.feature_extraction.text import TfidfVectorizer
|
||||
import torch
|
||||
from bert_score import score
|
||||
|
||||
def cosine_similarity(a, b):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        a: First vector (array-like).
        b: Second vector (array-like), same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0.0`` when either vector has zero
        norm (the ratio is undefined there and used to evaluate to NaN).
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator
|
||||
|
||||
from functools import lru_cache


@lru_cache(maxsize=1)
def _load_bert(model_name='bert-base-uncased'):
    """Load the BERT tokenizer and model once and reuse them across calls."""
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    return tokenizer, model


def _mean_pooled_embedding(text, tokenizer, model):
    """Return the mean of the last hidden states of ``text`` as a NumPy vector."""
    # Truncate to the model's maximum input length so long texts do not fail.
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()


def semantic_similarity(text1, text2):
    """Cosine similarity between mean-pooled BERT embeddings of two texts.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity of the two embedding vectors.
    """
    tokenizer, model = _load_bert()
    embedding1 = _mean_pooled_embedding(text1, tokenizer, model)
    embedding2 = _mean_pooled_embedding(text2, tokenizer, model)
    return cosine_similarity(embedding1, embedding2)
|
||||
|
||||
def lexical_similarity(text1, text2):
    """Cosine similarity between character n-gram TF-IDF vectors of two texts.

    The vectorizer is fitted on the two texts only, using character n-grams
    of length 1 to 3.
    """
    char_tfidf = TfidfVectorizer(stop_words=None, analyzer='char', ngram_range=(1, 3))
    dense_rows = char_tfidf.fit_transform([text1, text2]).toarray()
    first_vec, second_vec = dense_rows[0], dense_rows[1]
    return cosine_similarity(first_vec, second_vec)
|
||||
|
||||
def bert_score_similarity(texts1, texts2):
    """Return the BERTScore F1 values for paired texts as a NumPy array.

    Scoring uses ``bert-base-uncased`` and runs on CUDA when available,
    otherwise on the CPU. Only the F1 component of the result is returned.
    """
    run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    _, _, f1_scores = score(
        texts1,
        texts2,
        lang='en',
        verbose=False,
        model_type='bert-base-uncased',
        device=run_device,
    )
    return f1_scores.numpy()
|
||||
Loading…
Reference in New Issue