from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)
import evaluate

# 1. Load the BiScope dataset
# (Assumes the dataset exposes a "text" column and "train"/"validation"
# splits; adjust column and split names to the dataset's actual schema.)
dataset = load_dataset("HanxiGuo/BiScope_Data")

# 2. Tokenizer
MODEL = "microsoft/deberta-v3-small"  # DeBERTa-v3 tokenizer needs sentencepiece installed
tokenizer = AutoTokenizer.from_pretrained(MODEL)

def preprocess(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )

encoded_dataset = dataset.map(preprocess, batched=True)

# 3. Load the model with a binary classification head (human vs. AI-generated)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)

# 4. Metrics
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# 5. Training arguments
# (push_to_hub=True requires an authenticated session, e.g. `huggingface-cli login`.)
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # renamed to `eval_strategy` in newer transformers releases
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=True,  # upload checkpoints to the Hugging Face Hub
    hub_model_id="your-username/biscope-detector",
)

# 6. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# 7. Train and push the final model card + weights
trainer.train()
trainer.push_to_hub()
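
# ------------------------------------------------------------------
# 8. Inference sketch (assumptions: the push above succeeded, the Hub
#    repo "your-username/biscope-detector" is readable, and the 0/1
#    labels follow from num_labels=2; label names are not set above,
#    so the pipeline will report generic LABEL_0/LABEL_1.)
# ------------------------------------------------------------------
from transformers import pipeline

detector = pipeline(
    "text-classification",
    model="your-username/biscope-detector",
)

# Returns a list of {"label": ..., "score": ...} dicts, one per input.
print(detector("Sample passage to score for AI-generated text."))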