Joshua Lochner committed
Commit 2782b0c · 1 Parent(s): 183ba5e
Fix the reduction of overlapping segments
src/preprocess.py CHANGED (+14 -16)
@@ -1,6 +1,4 @@
-from 
-from utils import jaccard
-from shared import START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE
+from utils import jaccard, Task, InterruptibleTaskPool
 from functools import lru_cache
 from datetime import datetime
 import itertools
@@ -11,18 +9,16 @@ import segment
 from tqdm import tqdm
 from dataclasses import dataclass, field
 from transformers import HfArgumentParser
-from shared import GeneralArguments, CustomTokens
+from shared import CATGEGORY_OPTIONS, START_SEGMENT_TEMPLATE, END_SEGMENT_TEMPLATE, GeneralArguments, CustomTokens
 import csv
 import re
 import random
 import logging
-from youtube_transcript_api import YouTubeTranscriptApi
-from youtube_transcript_api._errors import CouldNotRetrieveTranscript, YouTubeRequestFailed, TooManyRequests
+from youtube_transcript_api import YouTubeTranscriptApi, CouldNotRetrieveTranscript, YouTubeRequestFailed, TooManyRequests
 import os
 import json
 import time
 import requests
-from utils import Task, InterruptibleTaskPool
 
 
 def find(s, ch):
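The two youtube_transcript_api imports collapse into one line because the library re-exports its exception classes from the package root, which the new import relies on. A minimal usage sketch; the fetch_transcript helper is hypothetical and not part of preprocess.py:

from youtube_transcript_api import (YouTubeTranscriptApi,
                                    CouldNotRetrieveTranscript,
                                    TooManyRequests)

def fetch_transcript(video_id):
    # Hypothetical helper: treat any retrieval failure as "no transcript".
    # get_transcript is the classic API in the library versions this script targets.
    try:
        return YouTubeTranscriptApi.get_transcript(video_id)
    except TooManyRequests:
        return None  # rate-limited; a caller could back off and retry
    except CouldNotRetrieveTranscript:
        return None  # transcript disabled or missing for this video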
@@ -264,6 +260,9 @@ def remove_duplicate_segments(segments):
         if best_similar_seg not in best:
             best.append(best_similar_seg)
 
+    if len(segments) != len(best):  # Saw some reduction... try again
+        return remove_duplicate_segments(best)
+
     return best
 
 
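This recursion is the fix named in the commit message: a single pass can merge two overlapping segments into a longer one that then overlaps a third segment it previously missed, so deduplication is only complete at a fixed point. A standalone sketch of the pattern, where reduce_once is a hypothetical stand-in for the single-pass body of remove_duplicate_segments:

def reduce_to_fixed_point(segments, reduce_once):
    # Apply one reduction pass; if it shrank the list, newly created
    # overlaps may exist, so run again until the length stabilises.
    best = reduce_once(segments)
    if len(best) != len(segments):
        return reduce_to_fixed_point(best, reduce_once)
    return best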
@@ -501,6 +500,7 @@ def main():
     processed_db_path = os.path.join(
         dataset_args.data_dir, dataset_args.processed_database)
 
+    @lru_cache(maxsize=1)
     def read_db():
         if not preprocess_args.overwrite and os.path.exists(processed_db_path):
             with open(processed_db_path) as fp:
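The new @lru_cache(maxsize=1) decorator memoises read_db, so repeated calls during one run parse the processed database once instead of re-reading the JSON file each time. A self-contained sketch of the same idea, with a hypothetical file path for illustration:

import json
from functools import lru_cache

@lru_cache(maxsize=1)
def read_db(path='processed.json'):  # hypothetical path, not the repo's
    # First call reads and parses the file; later calls return the
    # cached object without touching the disk.
    with open(path) as fp:
        return json.load(fp)

Because the cached dict is returned by reference, in-place edits made by one caller are seen by all later callers, which fits a script that mutates the database once and then saves it.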
@@ -558,6 +558,11 @@ def main():
                 # 'action': line['actionType'],
             })
 
+    # Remove duplicate sponsor segments by choosing best (most votes)
+    print('Remove duplicate segments')
+    for key in db:
+        db[key] = remove_duplicate_segments(db[key])
+
     # We now remove whole videos from the list
     # Helps with obtaining "fully-labelled" videos
     min_date = datetime.strptime(preprocess_args.min_date, '%d/%m/%Y')
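The added block runs the vote-based deduplication before whole videos are filtered out. The diff does not show the body of remove_duplicate_segments beyond the lines above, but the comment states the rule: among overlapping submissions, keep the one with the most votes. A hedged reconstruction of such a single-pass reducer, with a local interval-Jaccard helper standing in for the jaccard utility the file imports:

def interval_jaccard(a_start, a_end, b_start, b_end):
    # Jaccard overlap of two time intervals (stand-in for utils.jaccard).
    inter = max(0.0, min(a_end, b_end) - max(a_start, b_start))
    union = (a_end - a_start) + (b_end - b_start) - inter
    return inter / union if union > 0 else 0.0

def dedup_once(segments, threshold=0.8):
    # Hypothetical single pass: visit segments by descending votes and
    # keep one only if it does not heavily overlap an already-kept one.
    best = []
    for seg in sorted(segments, key=lambda s: -s['votes']):
        if not any(interval_jaccard(seg['start'], seg['end'],
                                    b['start'], b['end']) > threshold
                   for b in best):
            best.append(seg)
    return best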
@@ -580,14 +585,7 @@ def main():
             # Always include segments locked by VIPs, regardless of view count
             del db[key]
 
-
-
-    # Remove duplicate sponsor segments by choosing best (most votes)
-    print('Remove duplicate segments')
-    for key in db:
-        db[key] = remove_duplicate_segments(db[key])
-        num_segments += len(db[key])
-    print('Saved', len(db), 'videos and', num_segments, 'segments')
+    print('Saved', len(db), 'videos')
 
     with open(processed_db_path, 'w') as fp:
         json.dump(db, fp)
@@ -613,7 +611,7 @@ def main():
         for video_id in video_ids
     )
 
-    print('
+    print('Downloading transcripts')
     with tqdm(total=len(video_ids)) as progress:
         def callback(task):
             progress.set_description(f'Processing {task.args[0]}')
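The last hunk only names the phase before the progress bar starts, but the surrounding context shows the pattern in use: one shared tqdm bar driven from a per-task callback. A rough sketch of that pattern, replacing the repo's InterruptibleTaskPool with a plain loop and using a hypothetical process function:

from tqdm import tqdm

def process(video_id):
    return video_id  # hypothetical per-video work

video_ids = ['abc', 'def', 'ghi']
with tqdm(total=len(video_ids)) as progress:
    for video_id in video_ids:
        progress.set_description(f'Processing {video_id}')
        process(video_id)
        progress.update(1)  # advance the bar as each task completes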