kasanoma_ASR / app.py
Kennethdot's picture
Update app.py
60dd229 verified
Raw
History Blame Contribute Delete
20 kB
import os
import uuid
import shutil
import numpy as np
import pandas as pd
import soundfile as sf
import torch
import torchaudio
import gradio as gr
import threading
from dotenv import load_dotenv
from huggingface_hub import HfApi, create_repo, upload_file
from transformers import WhisperForConditionalGeneration, WhisperProcessor
# =========================================================
# LOAD ENV VARIABLES
# =========================================================
# load_dotenv() runs first so values from a local .env file are visible
# to the environment lookups below.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")
DATASET_REPO = os.environ.get("HF_DATASET_REPO")

# =========================================================
# LOCAL STORAGE
# =========================================================
# Samples are staged on disk under hf_dataset/ before being uploaded.
LOCAL_DATASET_DIR = "hf_dataset"
LOCAL_AUDIO_DIR = os.path.join(LOCAL_DATASET_DIR, "audio")
LOCAL_METADATA = os.path.join(LOCAL_DATASET_DIR, "metadata.csv")
os.makedirs(LOCAL_AUDIO_DIR, exist_ok=True)

# =========================================================
# HUGGING FACE SETUP
# =========================================================
api = HfApi(token=HF_TOKEN)

# Best effort: a failure here is printed and the app keeps starting.
try:
    create_repo(
        repo_id=DATASET_REPO,
        repo_type="dataset",
        exist_ok=True,
        token=HF_TOKEN,
    )
except Exception as repo_error:
    print("Dataset repo check:", repo_error)

# =========================================================
# LOAD MODEL
# =========================================================
_CHECKPOINT = "Kennethdot/kasanoma_whisper"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = WhisperForConditionalGeneration.from_pretrained(_CHECKPOINT).to(device)
model.eval()
processor = WhisperProcessor.from_pretrained(_CHECKPOINT)
# =========================================================
# TRANSCRIPTION FUNCTION
# =========================================================
def transcribe_audio(audio_path):
    """Transcribe one audio file with the module-level Whisper model.

    The file is read with soundfile, down-mixed to mono, resampled to
    16 kHz when its rate differs, and peak-normalized before feature
    extraction and generation.

    Args:
        audio_path: Path of the audio file, or None when no audio was
            provided.

    Returns:
        A 3-tuple ``(transcription, transcription, audio_path)``. The text
        appears twice because the click handler routes it to both the
        read-only and the editable textbox. When ``audio_path`` is None the
        result is ``("", "", None)``. When the file cannot be read or holds
        no samples the result is ``("Error reading audio: ...", "", None)``.
    """
    if audio_path is None:
        return "", "", None

    target_rate = 16000

    try:
        audio_data, sampling_rate = sf.read(audio_path)

        # Stereo → mono
        if audio_data.ndim > 1:
            audio_data = np.mean(audio_data, axis=1)
        audio_data = audio_data.astype(np.float32)

        # A zero-length recording would make np.max() below raise on an
        # empty array, so report it the same way as an unreadable file.
        if audio_data.size == 0:
            return "Error reading audio: file contains no samples", "", None

        # Resample
        if sampling_rate != target_rate:
            audio_tensor = torch.tensor(audio_data, dtype=torch.float32)
            resampler = torchaudio.transforms.Resample(
                orig_freq=sampling_rate,
                new_freq=target_rate,
            )
            audio_data = resampler(audio_tensor).numpy()
            sampling_rate = target_rate
    except Exception as e:
        return f"Error reading audio: {e}", "", None

    # Normalize to a peak of 1.0; all-zero (silent) audio is left as is.
    peak = np.max(np.abs(audio_data))
    if peak > 0:
        audio_data = audio_data / peak

    # Feature extraction
    input_features = processor.feature_extractor(
        audio_data,
        sampling_rate=sampling_rate,
        return_tensors="pt",
    ).input_features.to(device)

    # Inference
    # NOTE(review): language="yo" is passed although the UI describes
    # English–Twi speech; presumably the checkpoint was fine-tuned with
    # this language token. Confirm before changing it.
    with torch.no_grad():
        generated_ids = model.generate(
            input_features,
            task="transcribe",
            language="yo",
            temperature=0.0,
        )

    transcription = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
    )[0].strip()

    return transcription, transcription, audio_path
# =========================================================
# SAVE FUNCTION
# =========================================================
# Lock prevents concurrent saves from corrupting the CSV
_csv_lock = threading.Lock()
def _upload_in_background(saved_audio_path, relative_audio_path):
"""Runs in a daemon thread. Uploads audio then CSV β€” never blocks the UI."""
try:
upload_file(
path_or_fileobj=saved_audio_path,
path_in_repo=relative_audio_path,
repo_id=DATASET_REPO,
repo_type="dataset",
token=HF_TOKEN
)
with _csv_lock:
upload_file(
path_or_fileobj=LOCAL_METADATA,
path_in_repo="metadata.csv",
repo_id=DATASET_REPO,
repo_type="dataset",
token=HF_TOKEN
)
except Exception as e:
print(f"[Background upload error] {e}")
def save_sample(audio_path, corrected_text):
    """Store a corrected transcription with its audio and queue the upload.

    The audio is copied into ``LOCAL_AUDIO_DIR`` under a fresh UUID name,
    one row is appended to ``LOCAL_METADATA``, and a daemon thread running
    ``_upload_in_background`` is started. Failures are printed, not raised.

    Args:
        audio_path: Path of the audio file to keep, or None (nothing is
            saved in that case).
        corrected_text: Transcription text to record for the audio.

    Returns:
        None.
    """
    if audio_path is None:
        return

    try:
        unique_id = str(uuid.uuid4())
        saved_audio_path = os.path.join(LOCAL_AUDIO_DIR, f"{unique_id}.wav")

        # Copy audio locally — fast, no network
        shutil.copy(audio_path, saved_audio_path)

        relative_audio_path = f"audio/{unique_id}.wav"

        # Write metadata locally — fast, no network
        new_row = pd.DataFrame([{
            "id": unique_id,
            "audio": relative_audio_path,
            "transcription": corrected_text,
            "language": "twi_en",
        }])

        with _csv_lock:
            # Append instead of read-modify-rewrite: re-parsing the file
            # with read_csv would turn earlier transcriptions such as "NA"
            # or "007" into NaN / 7, and would fail on a zero-byte file.
            needs_header = (
                not os.path.exists(LOCAL_METADATA)
                or os.path.getsize(LOCAL_METADATA) == 0
            )
            new_row.to_csv(
                LOCAL_METADATA,
                mode="a",
                header=needs_header,
                index=False,
            )

        # Fire-and-forget — HuggingFace upload happens in background
        threading.Thread(
            target=_upload_in_background,
            args=(saved_audio_path, relative_audio_path),
            daemon=True,
        ).start()
    except Exception as e:
        print(f"[Save error] {e}")
# =========================================================
# CUSTOM CSS (page theme, passed to gr.Blocks(css=...) below)
# =========================================================
css = """
@import url('https://fonts.googleapis.com/css2?family=Sora:wght@300;400;600;700&family=DM+Mono:wght@400;500&display=swap');
:root {
--navy: #0d1b4b;
--blue: #1a6fd4;
--sky: #4db8f0;
--violet: #7b4fd4;
--lilac: #b57bee;
--white: #e8f0ff;
--card: rgba(255,255,255,0.07);
--border: rgba(180,160,255,0.2);
--radius: 16px;
}
/* ── animated mesh background ── */
body, .gradio-container {
background:
radial-gradient(ellipse at 15% 10%, #1a3fa8 0%, transparent 55%),
radial-gradient(ellipse at 85% 5%, #7b3fc4 0%, transparent 45%),
radial-gradient(ellipse at 50% 50%, #d0e8ff 0%, transparent 60%),
radial-gradient(ellipse at 80% 80%, #b57bee 0%, transparent 50%),
radial-gradient(ellipse at 10% 90%, #1a6fd4 0%, transparent 50%),
#0d1b4b !important;
font-family: 'Sora', sans-serif !important;
min-height: 100vh;
}
/* ── slow drifting orbs ── */
.gradio-container::before,
.gradio-container::after {
content: '';
position: fixed;
border-radius: 50%;
filter: blur(80px);
pointer-events: none;
z-index: 0;
}
.gradio-container::before {
width: 520px; height: 520px;
top: -120px; left: -100px;
background: radial-gradient(circle, rgba(74,130,230,0.45), transparent 70%);
animation: drift1 12s ease-in-out infinite alternate;
}
.gradio-container::after {
width: 480px; height: 480px;
bottom: -100px; right: -80px;
background: radial-gradient(circle, rgba(160,100,240,0.4), transparent 70%);
animation: drift2 15s ease-in-out infinite alternate;
}
@keyframes drift1 {
from { transform: translate(0, 0); }
to { transform: translate(60px, 80px); }
}
@keyframes drift2 {
from { transform: translate(0, 0); }
to { transform: translate(-50px, -70px); }
}
/* ── hero ── */
#hero {
text-align: center;
padding: 40px 24px 16px;
position: relative;
z-index: 1;
animation: fadeUp 0.8s ease both;
}
@keyframes fadeUp {
from { opacity: 0; transform: translateY(24px); }
to { opacity: 1; transform: translateY(0); }
}
/* ── animated orb ── */
.orb-wrap {
display: flex;
justify-content: center;
margin-bottom: 22px;
}
.orb-stage {
position: relative;
width: 110px;
height: 110px;
}
.orb-ring {
position: absolute;
border-radius: 50%;
border: 1.5px solid transparent;
inset: 0;
animation: orbSpin 6s linear infinite;
}
.orb-ring:nth-child(1) {
border-top-color: #4db8f0;
border-right-color: rgba(77,184,240,0.25);
animation-duration: 5s;
}
.orb-ring:nth-child(2) {
inset: 8px;
border-top-color: #b57bee;
border-left-color: rgba(181,123,238,0.25);
animation-direction: reverse;
animation-duration: 7s;
}
.orb-ring:nth-child(3) {
inset: 16px;
border-top-color: #1a6fd4;
border-bottom-color: rgba(26,111,212,0.25);
animation-duration: 9s;
}
@keyframes orbSpin { to { transform: rotate(360deg); } }
.orb-core {
position: absolute;
inset: 24px;
border-radius: 50%;
background: radial-gradient(circle at 35% 35%,
rgba(200,220,255,0.95),
rgba(100,150,240,0.8) 40%,
rgba(90,55,200,0.9) 75%,
rgba(25,15,75,1)
);
box-shadow:
0 0 22px 6px rgba(77,184,240,0.45),
0 0 50px 12px rgba(120,80,220,0.28),
inset 0 0 14px rgba(255,255,255,0.18);
animation: orbBreathe 3.5s ease-in-out infinite;
}
.orb-core::after {
content: '';
position: absolute;
top: 14%; left: 22%;
width: 28%; height: 20%;
border-radius: 50%;
background: rgba(255,255,255,0.5);
filter: blur(3px);
}
@keyframes orbBreathe {
0%, 100% {
transform: scale(1);
box-shadow: 0 0 22px 6px rgba(77,184,240,0.45), 0 0 50px 12px rgba(120,80,220,0.28), inset 0 0 14px rgba(255,255,255,0.18);
}
50% {
transform: scale(1.1);
box-shadow: 0 0 34px 12px rgba(77,184,240,0.65), 0 0 70px 20px rgba(120,80,220,0.42), inset 0 0 20px rgba(255,255,255,0.28);
}
}
.orb-waves {
position: absolute;
inset: 24px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
gap: 3px;
overflow: hidden;
}
.orb-wave-bar {
width: 3px;
border-radius: 99px;
background: rgba(255,255,255,0.75);
animation: waveBar 1.3s ease-in-out infinite;
}
.orb-wave-bar:nth-child(1) { height: 8px; animation-delay: 0s; }
.orb-wave-bar:nth-child(2) { height: 16px; animation-delay: 0.18s; }
.orb-wave-bar:nth-child(3) { height: 22px; animation-delay: 0.35s; }
.orb-wave-bar:nth-child(4) { height: 16px; animation-delay: 0.52s; }
.orb-wave-bar:nth-child(5) { height: 8px; animation-delay: 0.7s; }
@keyframes waveBar {
0%, 100% { transform: scaleY(0.3); opacity: 0.45; }
50% { transform: scaleY(1); opacity: 1; }
}
/* ── title & subtitle ── */
.kasa-title {
font-size: clamp(2.4rem, 7vw, 4rem);
font-weight: 700;
letter-spacing: -0.03em;
background: linear-gradient(120deg, var(--white) 0%, var(--sky) 40%, var(--lilac) 100%);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
margin: 0 0 10px;
line-height: 1.1;
}
.kasa-sub {
font-size: 0.97rem;
font-weight: 300;
color: rgba(220,230,255,0.85);
max-width: 460px;
margin: 0 auto 14px;
line-height: 1.65;
}
.kasa-badge {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 5px 16px;
border-radius: 999px;
border: 1px solid rgba(180,160,255,0.3);
background: rgba(120,100,220,0.12);
font-size: 0.7rem;
font-weight: 500;
letter-spacing: 0.13em;
text-transform: uppercase;
color: var(--lilac);
backdrop-filter: blur(8px);
}
/* ── divider ── */
.kasa-divider {
border: none;
border-top: 1px solid var(--border);
margin: 10px 0 28px;
position: relative;
z-index: 1;
}
/* ── panel labels ── */
.kasa-label {
font-size: 0.7rem;
font-weight: 600;
letter-spacing: 0.14em;
text-transform: uppercase;
color: #a8d4f8;
margin-bottom: 10px;
}
/* ── Gradio field labels ── */
label, .gr-form label, .svelte-1gfkn6j {
color: #c8d8f8 !important;
font-family: 'Sora', sans-serif !important;
font-size: 0.82rem !important;
font-weight: 500 !important;
}
/* ── WHITE audio widget ── */
.gr-audio,
[data-testid="audio"],
.gr-audio > div {
background: #ffffff !important;
border: 1.5px solid rgba(100,140,240,0.4) !important;
border-radius: 14px !important;
box-shadow: 0 4px 28px rgba(26,60,180,0.14) !important;
overflow: hidden !important;
}
[data-testid="audio"] button,
.gr-audio button {
color: #1a6fd4 !important;
background: transparent !important;
}
[data-testid="audio"] svg,
.gr-audio svg {
stroke: #1a6fd4 !important;
fill: none !important;
}
[data-testid="audio"] span,
[data-testid="audio"] .time,
[data-testid="audio"] .duration,
.gr-audio span {
color: #1a3fa8 !important;
font-family: 'DM Mono', monospace !important;
}
[data-testid="audio"] canvas,
.gr-audio canvas {
filter: hue-rotate(195deg) saturate(2) brightness(0.85) !important;
}
[data-testid="audio"] .tabs button,
.gr-audio .tabs button {
color: #1a6fd4 !important;
font-family: 'Sora', sans-serif !important;
font-weight: 600 !important;
}
[data-testid="audio"] .tabs button.selected,
.gr-audio .tabs button.selected {
border-bottom: 2px solid #1a6fd4 !important;
}
/* ── WHITE textbox ── */
textarea, .gr-textbox textarea {
background: #ffffff !important;
border: 1.5px solid rgba(100,140,240,0.4) !important;
border-radius: 12px !important;
color: #0d1b6e !important;
font-family: 'Sora', sans-serif !important;
font-size: 0.95rem !important;
font-weight: 400 !important;
padding: 14px !important;
box-shadow: 0 4px 24px rgba(26,60,180,0.1) !important;
transition: border-color 0.2s, box-shadow 0.2s !important;
line-height: 1.75 !important;
letter-spacing: 0.01em !important;
}
textarea::placeholder {
color: #7a9acc !important;
font-style: italic;
font-family: 'Sora', sans-serif !important;
}
textarea:focus {
border-color: #4db8f0 !important;
box-shadow: 0 0 0 3px rgba(77,184,240,0.2) !important;
outline: none !important;
}
.gr-textbox, [data-testid="textbox"] {
background: #ffffff !important;
border: 1.5px solid rgba(100,140,240,0.4) !important;
border-radius: 14px !important;
box-shadow: 0 4px 28px rgba(26,60,180,0.14) !important;
overflow: hidden !important;
}
/* ── Edit hint text ── */
.edit-hint {
font-size: 0.75rem;
font-weight: 300;
color: rgba(200,215,255,0.6);
margin-top: -6px;
margin-bottom: 8px;
font-style: italic;
letter-spacing: 0.02em;
}
/* ── Buttons ── */
.gr-button-primary, button.primary {
background: linear-gradient(135deg, #1a6fd4 0%, #7b4fd4 100%) !important;
border: none !important;
border-radius: 10px !important;
font-family: 'Sora', sans-serif !important;
font-weight: 600 !important;
font-size: 0.9rem !important;
padding: 12px 28px !important;
color: #ffffff !important;
transition: transform 0.15s, box-shadow 0.15s !important;
box-shadow: 0 4px 22px rgba(100,80,200,0.4) !important;
position: relative;
z-index: 1;
}
.gr-button-primary:hover, button.primary:hover {
transform: translateY(-2px) !important;
box-shadow: 0 8px 32px rgba(120,80,220,0.55) !important;
}
.gr-button-primary:active, button.primary:active {
transform: translateY(0) !important;
}
/* ── Save checkmark button ── */
#save-btn button {
width: 52px !important;
height: 52px !important;
min-width: 52px !important;
border-radius: 50% !important;
padding: 0 !important;
font-size: 1.5rem !important;
background: linear-gradient(135deg, #1a6fd4 0%, #7b4fd4 100%) !important;
border: none !important;
box-shadow: 0 4px 22px rgba(100,80,200,0.4) !important;
color: #fff !important;
transition: transform 0.15s, box-shadow 0.15s !important;
line-height: 1 !important;
}
#save-btn button:hover {
transform: scale(1.12) translateY(-2px) !important;
box-shadow: 0 8px 32px rgba(120,80,220,0.55) !important;
}
#save-btn button:active {
transform: scale(0.96) !important;
}
/* ── footer ── */
.kasa-footer {
text-align: center;
font-size: 0.72rem;
color: rgba(180,190,255,0.58);
padding: 24px 0 32px;
letter-spacing: 0.05em;
position: relative;
z-index: 1;
}
/* ── pulse dot ── */
.pulse-dot {
display: inline-block;
width: 9px; height: 9px;
border-radius: 50%;
background: #ffffff;
box-shadow: 0 0 6px 2px rgba(255,255,255,0.6);
animation: pulse 2s ease-in-out infinite;
}
@keyframes pulse {
0%, 100% { opacity: 1; transform: scale(1); }
50% { opacity: 0.3; transform: scale(0.65); }
}
"""
# =========================================================
# UI
# =========================================================
# Layout: hero banner, then two columns (audio input on the left,
# transcription and correction on the right), then a footer.
with gr.Blocks(css=css, theme=gr.themes.Base()) as demo:
    # ── Hero ──
    gr.HTML("""
<div id="hero">
<div class="orb-wrap">
<div class="orb-stage">
<div class="orb-ring"></div>
<div class="orb-ring"></div>
<div class="orb-ring"></div>
<div class="orb-core"></div>
<div class="orb-waves">
<div class="orb-wave-bar"></div>
<div class="orb-wave-bar"></div>
<div class="orb-wave-bar"></div>
<div class="orb-wave-bar"></div>
<div class="orb-wave-bar"></div>
</div>
</div>
</div>
<div class="kasa-title">Kasanoma ASR</div>
<div class="kasa-sub">
Automatic speech recognition for natural English–Twi
code-switched conversations. Speak the way you actually speak.
</div>
<span class="kasa-badge"><span class="pulse-dot"></span>Live Transcription</span>
</div>
""")

    # ── Main columns ──
    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            gr.HTML('<div class="kasa-label">Audio Input</div>')
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or upload audio"
            )
            # The label previously contained mis-decoded bytes instead of
            # the arrow character.
            transcribe_btn = gr.Button("Transcribe →", variant="primary")

        with gr.Column(scale=1):
            gr.HTML('<div class="kasa-label">Transcription</div>')
            model_output = gr.Textbox(
                label="",
                placeholder="Transcription will appear here…",
                lines=3,
                interactive=False
            )
            gr.HTML('<div class="kasa-label" style="margin-top:14px;">Edit</div>')
            gr.HTML('<div class="edit-hint">Type what was actually said — fix any errors the model made, then hit ✓ to save.</div>')
            with gr.Row(equal_height=True):
                corrected_output = gr.Textbox(
                    label="",
                    placeholder="Correct the transcription if needed…",
                    lines=3,
                    scale=9
                )
                save_btn = gr.Button("✓", elem_id="save-btn", scale=1)

    # Carries the path of the transcribed audio from the transcribe click
    # to the save click.
    hidden_audio_path = gr.State()

    # ── Footer ──
    gr.HTML("""
<div class="kasa-footer">
Kasanoma &middot; English&ndash;Twi Code-Switching ASR &middot; Project Kasa 2026
</div>
""")

    # ── Event handlers ──
    # transcribe_audio returns (text, text, path): the text fills both
    # textboxes and the path is stored for a later save.
    transcribe_btn.click(
        fn=transcribe_audio,
        inputs=audio_input,
        outputs=[model_output, corrected_output, hidden_audio_path]
    )
    # save_sample returns nothing, so no outputs are wired.
    save_btn.click(
        fn=save_sample,
        inputs=[hidden_audio_path, corrected_output]
    )

# =========================================================
# LAUNCH
# =========================================================
# Only start the server when the file is run as a script.
if __name__ == "__main__":
    demo.launch(share=True)