# NOTE: lines below originally contained Hugging Face Spaces page chrome
# ("Spaces: Running") captured during extraction — not part of the program.
import gradio as gr
from transformers import pipeline
import librosa
import numpy as np

# ---------------------------------------------------------
# 1. LOAD AUDIO MODEL (for accurate detection)
# ---------------------------------------------------------
# CLAP zero-shot audio classifier: scores free-text labels against audio.
classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused",
)
# Text labels the CLAP model scores for each audio segment.
# "dog barking" is the label of interest; the others act as distractors
# so that non-bark sounds receive a competing explanation.
CANDIDATE_LABELS = [
    "dog barking",
    "dog growling",
    "people talking",
    "traffic noise",
    "car passing",
    "bird singing",
    "music",
    "silence",
]
# ---------------------------------------------------------
# 2. FIXED PARAMETERS FOR THE ENERGY ANALYSIS
# ---------------------------------------------------------
ENERGY_FRAME_MS = 25         # RMS frame length (ms)
ENERGY_HOP_MS = 10           # hop between frames (ms)
ENERGY_QUANTILE = 0.80       # loudness threshold: top-20% quantile
MIN_EVENT_DURATION = 0.25    # minimum duration of a loud event (s)
MIN_SEGMENT_FOR_CLAP = 0.15  # minimum segment length for CLAP to be meaningful (s)
# ---------------------------------------------------------
# 3. FUNCTION: FIND LOUD EVENTS
# ---------------------------------------------------------
def find_loud_events(y, sr):
    """Locate loud regions in *y* via frame-wise RMS energy.

    A frame counts as "loud" when its RMS exceeds the ENERGY_QUANTILE
    quantile of all frames; runs of loud frames become events, and
    events shorter than MIN_EVENT_DURATION seconds are discarded.

    Args:
        y: mono audio signal (1-D array).
        sr: sample rate of *y* in Hz.

    Returns:
        List of (start_s, end_s) tuples in seconds.
    """
    # Convert ms parameters to samples; enforce librosa-friendly minimums.
    frame_len = max(int(sr * ENERGY_FRAME_MS / 1000), 512)
    hop_len = max(int(sr * ENERGY_HOP_MS / 1000), 128)

    energy = librosa.feature.rms(
        y=y,
        frame_length=frame_len,
        hop_length=hop_len,
    )[0]
    frame_times = librosa.frames_to_time(
        np.arange(len(energy)),
        sr=sr,
        hop_length=hop_len,
    )

    loud = energy > np.quantile(energy, ENERGY_QUANTILE)

    events = []
    event_start = None  # start time of the event currently being tracked
    for t, is_loud in zip(frame_times, loud):
        if is_loud:
            if event_start is None:
                event_start = t  # a new loud run begins here
        elif event_start is not None:
            # Loud run just ended; keep it only if long enough.
            if t - event_start >= MIN_EVENT_DURATION:
                events.append((event_start, t))
            event_start = None
    # Close an event that is still open at the end of the signal.
    if event_start is not None and frame_times[-1] - event_start >= MIN_EVENT_DURATION:
        events.append((event_start, frame_times[-1]))
    return events
# ---------------------------------------------------------
# 4. DETECT BARK SEGMENTS
#    - either "fast mode" (energy only)
#    - or CLAP-based, batched
# ---------------------------------------------------------
def detect_bark_windows(y, sr, loud_events, bark_prob_threshold, use_clap):
    """Decide which loud events count as barking.

    Args:
        y: mono audio signal (1-D array).
        sr: sample rate of *y* in Hz.
        loud_events: list of (start_s, end_s) tuples from find_loud_events().
        bark_prob_threshold: minimum CLAP score for "dog barking".
        use_clap: if False, skip the model entirely and treat every loud
            event as barking (pure loudness logic).

    Returns:
        List of (start_s, end_s, score) tuples judged to be barking.
        In fast mode the score is always 1.0.
    """
    if not use_clap:
        # Fast mode: everything loud is counted as barking.
        return [(s, e, 1.0) for (s, e) in loud_events]

    # CLAP mode: cut the loud segments out of the signal and batch them.
    segments = []
    meta = []  # (start_s, end_s) matching each entry in `segments`
    for (s, e) in loud_events:
        if e - s < MIN_SEGMENT_FOR_CLAP:
            continue  # too short for CLAP to produce a meaningful score
        seg = y[int(s * sr):int(e * sr)]
        if len(seg) == 0:
            continue
        segments.append(seg)
        meta.append((s, e))
    if not segments:
        return []

    # Batched pipeline call (much faster than one call per event).
    # NOTE: the zero-shot-audio-classification pipeline has no
    # `multi_label` parameter (the original passed multi_label=True and it
    # was silently ignored) — scores are a softmax over CANDIDATE_LABELS,
    # not independent probabilities.
    results_list = classifier(
        segments,
        candidate_labels=CANDIDATE_LABELS,
        batch_size=4,
    )

    bark_windows = []
    for (s, e), results in zip(meta, results_list):
        # Extract the score assigned to the "dog barking" label.
        bark_score = 0.0
        for r in results:
            if r["label"].lower() == "dog barking":
                bark_score = float(r["score"])
                break
        if bark_score >= bark_prob_threshold:
            bark_windows.append((s, e, bark_score))
    return bark_windows
# ---------------------------------------------------------
# 5. MAIN ANALYSIS FUNCTION
# ---------------------------------------------------------
def analyze_barking(audio_path, max_pause_sec, bark_prob_threshold, fast_mode):
    """Run the full pipeline and render a Markdown report (in German).

    Steps: load audio -> find loud events -> classify barking ->
    merge bark windows into episodes -> format the report.

    Args:
        audio_path: path to the uploaded audio file (or None/"").
        max_pause_sec: pause (s) above which a new episode is counted.
        bark_prob_threshold: minimum CLAP score for "dog barking".
        fast_mode: if True, skip the CLAP model (loudness only).

    Returns:
        A Markdown string — the report, or a user-facing error message.
    """
    # --- 0. validate upload ------------------------------------------
    if not audio_path:
        return "Es wurde keine Audiodatei hochgeladen."

    # --- 1. load audio (resampled to 16 kHz mono) --------------------
    try:
        signal, rate = librosa.load(audio_path, sr=16000, mono=True)
    except Exception as e:
        return f"Fehler beim Laden der Audiodatei: {e}"

    if len(signal) == 0:
        return "Die Audiodatei ist leer."

    # --- 2. find loud events -----------------------------------------
    loud_events = find_loud_events(signal, rate)
    if not loud_events:
        return "Keine lauten Ereignisse gefunden – vermutlich kein Bellen."

    # --- 3. classify bark windows ------------------------------------
    bark_windows = detect_bark_windows(
        signal,
        rate,
        loud_events,
        bark_prob_threshold,
        use_clap=not fast_mode,
    )
    if not bark_windows:
        if fast_mode:
            return (
                "Fast Mode (ohne KI): keine ausreichend lauten Ereignisse, "
                "die als Bellen interpretiert wurden."
            )
        return (
            "Es wurde kein Hundebellen mit ausreichend hoher Sicherheit erkannt.\n\n"
            f"(Schwellwert für 'dog barking' = {bark_prob_threshold:.2f})"
        )

    # --- 4. merge bark windows into episodes -------------------------
    bark_windows.sort(key=lambda w: w[0])  # sort by start time
    episodes = []
    ep_start, ep_end = bark_windows[0][0], bark_windows[0][1]
    for win_start, win_end, _score in bark_windows[1:]:
        if win_start - ep_end <= max_pause_sec:
            # Gap is small enough: extend the current episode.
            ep_end = max(ep_end, win_end)
        else:
            episodes.append((ep_start, ep_end))
            ep_start, ep_end = win_start, win_end
    episodes.append((ep_start, ep_end))

    # --- 5. key figures & report -------------------------------------
    total_seconds = sum(end - start for (start, end) in episodes)
    mode_text = "Fast Mode (nur Energie)" if fast_mode else "CLAP-KI-Modus"

    lines = [
        f"**Modus:** {mode_text}",
        f"**A: Anzahl der Bell-Ereignisse:** {len(episodes)}",
        f"**B: Gesamtdauer des Bellens:** {total_seconds:.1f} Sekunden\n",
        (
            f"_Regel_: > {max_pause_sec:.1f} Sekunden Pause = neues Ereignis\n"
            f"_Schwellwert 'dog barking'_: {bark_prob_threshold:.2f}\n"
        ),
        "**Details:**",
    ]
    for idx, (start, end) in enumerate(episodes, start=1):
        lines.append(
            f"- Ereignis {idx}: {start:.1f}s bis {end:.1f}s — Dauer: {end - start:.1f}s"
        )
    return "\n".join(lines)
# ---------------------------------------------------------
# 6. GRADIO UI — WITH SLIDERS & FAST MODE
# ---------------------------------------------------------
audio_input = gr.Audio(type="filepath", label="Audio hochladen (.wav, .mp3)")

pause_slider = gr.Slider(
    minimum=1.0,
    maximum=10.0,
    value=3.0,
    step=0.5,
    label="Maximale Pause zwischen Bellen (Sekunden)",
)

# Very fine step so that faint/distant barking can still pass the threshold.
threshold_slider = gr.Slider(
    minimum=0.000001,  # 1e-6
    maximum=0.9,
    value=0.35,
    step=0.000001,
    label="Schwellwert für 'dog barking' (0.000001–1)",
    info="Je kleiner, desto empfindlicher (erkennt sehr leises/fernes Bellen).",
)

fast_checkbox = gr.Checkbox(
    value=False,
    label="Fast Mode (nur Lautstärke, ohne KI-Modell – sehr schnell, aber ungenauer)",
)

_DESCRIPTION = (
    "Erkennt Hundebellen in Aufnahmen.\n\n"
    "Optionen:\n"
    "- **Maximale Pause**: ab welcher Pause ein neues Bell-Ereignis gezählt wird.\n"
    "- **Schwellwert**: ab welcher Wahrscheinlichkeit 'dog barking' gezählt wird.\n"
    "- **Fast Mode**: nur Lautstärke-Analyse (schnell), ohne 'dog barking'-Modell."
)

demo = gr.Interface(
    fn=analyze_barking,
    inputs=[audio_input, pause_slider, threshold_slider, fast_checkbox],
    outputs=gr.Markdown(),
    title="Barking Episode Analyzer (mit Parametern & Fast Mode)",
    description=_DESCRIPTION,
)

if __name__ == "__main__":
    demo.launch()