Amr-h/EOU_Arabic_Saudi
Viewer • Updated • 15k • 21
How to use Amr-h/arabic-eou-marbert with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline
pipe = pipeline("text-classification", model="Amr-h/arabic-eou-marbert")
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
tokenizer = AutoTokenizer.from_pretrained("Amr-h/arabic-eou-marbert")
model = AutoModelForSequenceClassification.from_pretrained("Amr-h/arabic-eou-marbert")

This model detects whether a speaker has finished their turn in Arabic conversations, with an emphasis on the Saudi dialect. It is designed for real-time voice agent applications such as LiveKit.
| Label | ID | Description |
|---|---|---|
| INCOMPLETE | 0 | Speaker has not finished their turn |
| COMPLETE | 1 | Speaker has finished their turn |
| Metric | Complete (1) | Incomplete (0) |
|---|---|---|
| Precision | 100.00% | 85.94% |
| Recall | 83.64% | 100.00% |
| F1-Score | 91.09% | 92.44% |
Overall Weighted F1: 91.76%
from transformers import pipeline
import torch

# Load the model.
# Use the first GPU when CUDA is available; otherwise fall back to CPU (-1)
# so the example also runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1
eou_detector = pipeline(
    "text-classification",
    model="Amr-h/arabic-eou-marbert",
    device=device,
)

# Detect end of utterance
text = "هل بلغوك انهم بيحتاجون ساعات اضافيه؟"
result = eou_detector(text)[0]  # first entry of the pipeline output
print(f"Label: {result['label']}")  # COMPLETE or INCOMPLETE
print(f"Confidence: {result['score']:.2%}")
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("Amr-h/arabic-eou-marbert")
tokenizer = AutoTokenizer.from_pretrained("Amr-h/arabic-eou-marbert")

# Inference
text = "انتظر خلني اشوف وين حطيت ال"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=64)

# Gradients are not needed for inference, so skip tracking them.
with torch.no_grad():
    outputs = model(**inputs)

# Index of the highest logit: 1 is COMPLETE, anything else is INCOMPLETE.
prediction = torch.argmax(outputs.logits, dim=-1).item()
label = "COMPLETE" if prediction == 1 else "INCOMPLETE"
print(f"Prediction: {label}")
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
class ArabicEOUDetector:
    """Wrap the sequence-classification model for end-of-utterance detection.

    Loads the model and tokenizer once so that ``predict`` can be called
    repeatedly on single pieces of text.
    """

    def __init__(self, model_name="Amr-h/arabic-eou-marbert"):
        """Load the model and tokenizer identified by ``model_name``."""
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Inference only: switch the model to evaluation mode.
        self.model.eval()

    def predict(self, text: str) -> tuple[bool, float]:
        """Classify a single text as a complete or incomplete turn.

        Args:
            text: The utterance to classify. It is truncated to 64 tokens.

        Returns:
            ``(is_complete, confidence)``: ``is_complete`` is True when the
            predicted class id is 1; ``confidence`` is the softmax
            probability of the predicted class.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=64,
        )

        # Gradients are not needed for inference, so skip tracking them.
        with torch.no_grad():
            outputs = self.model(**inputs)
            probs = torch.softmax(outputs.logits, dim=-1)

        prediction = torch.argmax(probs, dim=-1).item()
        # Row 0 is the single input text passed to the tokenizer above.
        confidence = probs[0][prediction].item()
        is_complete = prediction == 1
        return is_complete, confidence
If you use this model, please cite:
@misc{arabic-eou-marbert,
author = {Amr-h},
title = {Arabic End-of-Utterance Detection Model},
year = {2024},
publisher = {Hugging Face},
url = {https://huggingface.co/Amr-h/arabic-eou-marbert}
}
Apache 2.0
Base model
UBC-NLP/MARBERT