import gradio as gr
from transformers import pipeline
import librosa
import numpy as np

# ---------------------------------------------------------
# 1. LOAD THE AUDIO MODEL (for accurate detection)
# ---------------------------------------------------------

classifier = pipeline(
    task="zero-shot-audio-classification",
    model="laion/clap-htsat-unfused"
)
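
# CLAP scores each audio clip against these free-text labels (zero-shot), so
# the non-bark labels below act as distractors that "dog barking" has to outrank.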

CANDIDATE_LABELS = [
    "dog barking",
    "dog growling",
    "people talking",
    "traffic noise",
    "car passing",
    "bird singing",
    "music",
    "silence",
]

# ---------------------------------------------------------
# 2. FIXED PARAMETERS FOR THE ENERGY ANALYSIS
# ---------------------------------------------------------

ENERGY_FRAME_MS = 25        # frame length for the energy analysis (ms)
ENERGY_HOP_MS = 10          # hop size between frames (ms)
ENERGY_QUANTILE = 0.80      # loudness threshold (top 20% quantile)
MIN_EVENT_DURATION = 0.25   # minimum duration of a loud event (s)

MIN_SEGMENT_FOR_CLAP = 0.15  # minimum segment length for CLAP to be useful (s)

# ---------------------------------------------------------
# 3. FUNCTION: FIND LOUD EVENTS
# ---------------------------------------------------------

def find_loud_events(y, sr):
    frame_length = int(sr * ENERGY_FRAME_MS / 1000)
    hop_length = int(sr * ENERGY_HOP_MS / 1000)

    frame_length = max(frame_length, 512)
    hop_length = max(hop_length, 128)

    rms = librosa.feature.rms(
        y=y,
        frame_length=frame_length,
        hop_length=hop_length
    )[0]
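    # rms[i] is the short-time energy of frame i; times[i] below maps each
    # frame index back to seconds in the recording.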

    times = librosa.frames_to_time(
        np.arange(len(rms)),
        sr=sr,
        hop_length=hop_length
    )

    thr = np.quantile(rms, ENERGY_QUANTILE)
    mask = rms > thr
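    # Boolean mask of "loud" frames: roughly the loudest 20% of frames with
    # ENERGY_QUANTILE = 0.80.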

    events = []
    in_event = False
    start_t = 0.0
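
    # Walk over the frames and collect contiguous loud runs as (start, end)
    # events; runs shorter than MIN_EVENT_DURATION are discarded.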

    for i, is_loud in enumerate(mask):
        t = times[i]

        if is_loud and not in_event:
            in_event = True
            start_t = t
        elif not is_loud and in_event:
            end_t = t
            if end_t - start_t >= MIN_EVENT_DURATION:
                events.append((start_t, end_t))
            in_event = False

    if in_event:
        end_t = times[-1]
        if end_t - start_t >= MIN_EVENT_DURATION:
            events.append((start_t, end_t))

    return events

# ---------------------------------------------------------
# 4. DETECT BARK SEGMENTS
#    - either "fast mode" (energy only)
#    - or CLAP-based, but batched
# ---------------------------------------------------------

def detect_bark_windows(y, sr, loud_events, bark_prob_threshold, use_clap):
    """
    Gibt eine Liste von (start_s, end_s) zurück, die als Bellen gewertet werden.
    Wenn use_clap=False: jedes laute Event = Bellen (reine Lautstärke-Logik).
    Wenn use_clap=True: CLAP bewertet die Events.
    """
    if not use_clap:
        # Fast mode: anything that is loud is counted as barking
        return [(s, e, 1.0) for (s, e) in loud_events]

    # CLAP mode: batch the events for the pipeline
    segments = []
    meta = []  # (start_s, end_s) for each segment

    for (s, e) in loud_events:
        if e - s < MIN_SEGMENT_FOR_CLAP:
            continue
        start_idx = int(s * sr)
        end_idx = int(e * sr)
        seg = y[start_idx:end_idx]
        if len(seg) == 0:
            continue
        # The pipeline interprets a raw numpy array as audio at the feature
        # extractor's own sampling rate (48 kHz for this CLAP checkpoint), so
        # resample the 16 kHz segment to match before classifying it.
        seg = librosa.resample(
            seg,
            orig_sr=sr,
            target_sr=classifier.feature_extractor.sampling_rate,
        )
        segments.append(seg)
        meta.append((s, e))

    if not segments:
        return []

    # Single batched pipeline call (much faster than per-segment calls);
    # the pipeline normalizes the scores across the candidate labels.
    results_list = classifier(
        segments,
        candidate_labels=CANDIDATE_LABELS,
        batch_size=4,
    )

    bark_windows = []
    for (s, e), results in zip(meta, results_list):
        bark_score = 0.0
        for r in results:
            if r["label"].lower() == "dog barking":
                bark_score = float(r["score"])
                break
        if bark_score >= bark_prob_threshold:
            bark_windows.append((s, e, bark_score))

    return bark_windows

# ---------------------------------------------------------
# 5. MAIN ANALYSIS FUNCTION
# ---------------------------------------------------------

def analyze_barking(audio_path, max_pause_sec, bark_prob_threshold, fast_mode):
    # 0. Check the upload
    if audio_path is None or audio_path == "":
        return "No audio file was uploaded."

    # 1. Load the audio as 16 kHz mono
    try:
        y, sr = librosa.load(audio_path, sr=16000, mono=True)
    except Exception as e:
        return f"Error while loading the audio file: {e}"

    duration = len(y) / sr
    if duration == 0:
        return "Die Audiodatei ist leer."

    # 2. Find loud events
    loud_events = find_loud_events(y, sr)
    if not loud_events:
        return "No loud events found – presumably no barking."

    # 3. Find barking segments
    bark_windows = detect_bark_windows(
        y,
        sr,
        loud_events,
        bark_prob_threshold,
        use_clap=not fast_mode
    )

    if not bark_windows:
        if fast_mode:
            return (
                "Fast mode (no AI): no sufficiently loud events "
                "that could be interpreted as barking."
            )
        else:
            return (
                "No dog barking was detected with sufficient confidence.\n\n"
                f"(Threshold for 'dog barking' = {bark_prob_threshold:.2f})"
            )

    # 4. Merge bark segments into episodes
    bark_windows.sort(key=lambda x: x[0])  # by start time

    episodes = []
    cur_start, cur_end, _ = bark_windows[0]
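
    # Windows that start within max_pause_sec of the current episode's end
    # extend that episode; a larger gap closes it and starts a new one.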

    for s, e, _ in bark_windows[1:]:
        if s - cur_end <= max_pause_sec:
            cur_end = max(cur_end, e)
        else:
            episodes.append((cur_start, cur_end))
            cur_start, cur_end = s, e

    episodes.append((cur_start, cur_end))

    # 5. Summary metrics
    count = len(episodes)
    total_seconds = sum(e2 - e1 for (e1, e2) in episodes)
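    # Episode durations include pauses shorter than max_pause_sec, since those
    # gaps were merged into the episodes above.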

    lines = []
    mode_text = "Fast mode (energy only)" if fast_mode else "CLAP AI mode"
    lines.append(f"**Mode:** {mode_text}")
    lines.append(f"**A: Number of barking episodes:** {count}")
    lines.append(f"**B: Total barking duration:** {total_seconds:.1f} seconds\n")
    lines.append(
        f"_Rule_: > {max_pause_sec:.1f} seconds of pause = new episode\n"
        f"_Threshold for 'dog barking'_: {bark_prob_threshold:.2f}\n"
    )
    lines.append("**Details:**")
    for i, (s, e) in enumerate(episodes, start=1):
        dur = e - s
        lines.append(f"- Episode {i}: {s:.1f}s to {e:.1f}s — duration: {dur:.1f}s")

    return "\n".join(lines)

# ---------------------------------------------------------
# 6. GRADIO UI – WITH SLIDERS & FAST MODE
# ---------------------------------------------------------

audio_input = gr.Audio(type="filepath", label="Upload audio (.wav, .mp3)")
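# type="filepath" passes the path of a temporary copy of the upload to the
# analysis function, which librosa.load reads directly.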

pause_slider = gr.Slider(
    minimum=1.0,
    maximum=10.0,
    value=3.0,
    step=0.5,
    label="Maximale Pause zwischen Bellen (Sekunden)",
)

threshold_slider = gr.Slider(
    minimum=0.000001,        # 1e-6
    maximum=0.9,
    value=0.35,
    step=0.000001,
    label="Schwellwert für 'dog barking' (0.000001–1)",
    info="Je kleiner, desto empfindlicher (erkennt sehr leises/fernes Bellen)."
)


fast_checkbox = gr.Checkbox(
    value=False,
    label="Fast Mode (nur Lautstärke, ohne KI-Modell – sehr schnell, aber ungenauer)",
)

demo = gr.Interface(
    fn=analyze_barking,
    inputs=[audio_input, pause_slider, threshold_slider, fast_checkbox],
    outputs=gr.Markdown(),
    title="Barking Episode Analyzer (mit Parametern & Fast Mode)",
    description=(
        "Erkennt Hundebellen in Aufnahmen.\n\n"
        "Optionen:\n"
        "- **Maximale Pause**: ab welcher Pause ein neues Bell-Ereignis gezählt wird.\n"
        "- **Schwellwert**: ab welcher Wahrscheinlichkeit 'dog barking' gezählt wird.\n"
        "- **Fast Mode**: nur Lautstärke-Analyse (schnell), ohne 'dog barking'-Modell."
    ),
)

if __name__ == "__main__":
    demo.launch()