Joshua Lochner committed · Commit cfbd4d5 · 1 Parent(s): de9c8c4

Update preprocessing script to use logging module

src/preprocess.py CHANGED (+28 -27)
@@ -20,6 +20,9 @@ import time
 import requests
 
 
+logger = logging.getLogger(__name__)
+
+
 PROFANITY_RAW = '[ __ ]' # How YouTube transcribes profanity
 PROFANITY_CONVERTED = '*****' # Safer version for tokenizing
 
@@ -204,7 +207,7 @@ def get_words(video_id, process=True, transcript_type='auto', fallback='manual',
             pass # Mark as empty transcript
 
     except json.decoder.JSONDecodeError:
-
+        logger.warning(f'JSONDecodeError for {video_id}')
         if os.path.exists(transcript_path):
             os.remove(transcript_path) # Remove file and try again
         return get_words(video_id, process, transcript_type, fallback, granularity)
@@ -543,12 +546,12 @@ def main():
         preprocess_args.raw_data_dir, preprocess_args.raw_data_file)
 
     if preprocess_args.update_database:
-
+        logger.info('Updating database')
         for mirror in MIRRORS:
-
+            logger.info(f'Downloading from {mirror}')
             if download_file(mirror, raw_dataset_path):
                 break
-
+            logger.warning('Failed, trying next')
 
     os.makedirs(dataset_args.data_dir, exist_ok=True)
     processed_db_path = os.path.join(
@@ -558,11 +561,10 @@ def main():
     @lru_cache(maxsize=1)
     def read_db():
         if not preprocess_args.overwrite and os.path.exists(processed_db_path):
-
-            'Using cached processed database (use `--overwrite` to avoid this behaviour).')
+            logger.info('Using cached processed database (use `--overwrite` to avoid this behaviour).')
             with open(processed_db_path) as fp:
                 return json.load(fp)
-
+        logger.info('Processing raw database')
         db = {}
 
         allowed_categories = list(map(str.lower, CATGEGORY_OPTIONS))
@@ -618,7 +620,7 @@ def main():
 
         # Remove duplicate sponsor segments by choosing best (most votes)
         if not preprocess_args.keep_duplicate_segments:
-
+            logger.info('Remove duplicate segments')
            for key in db:
                db[key] = remove_duplicate_segments(db[key])
 
@@ -646,7 +648,7 @@ def main():
 
        # TODO remove videos that contain a full-video label?
 
-
+        logger.info(f'Saved {len(db)} videos')
 
        with open(processed_db_path, 'w') as fp:
            json.dump(db, fp)
@@ -660,7 +662,7 @@ def main():
    # 'userID', 'timeSubmitted', 'views', 'category', 'actionType', 'service', 'videoDuration',
    # 'hidden', 'reputation', 'shadowHidden', 'hashedVideoID', 'userAgent', 'description'
    if preprocess_args.do_transcribe:
-
+        logger.info('Collecting videos')
        parsed_database = read_db()
 
        # Remove transcripts already processed
@@ -678,7 +680,7 @@ def main():
            get_words(video_id)
            return video_id
 
-
+        logger.info('Setting up ThreadPoolExecutor')
        with concurrent.futures.ThreadPoolExecutor(max_workers=preprocess_args.num_jobs) as pool, \
                tqdm(total=len(video_ids)) as progress:
 
@@ -698,21 +700,21 @@ def main():
                    progress.update()
 
            except KeyboardInterrupt:
-
+                logger.info('Gracefully shutting down: Cancelling unscheduled tasks')
 
                # only futures that are not done will prevent exiting
                for future in to_process:
                    future.cancel()
 
-
+                logger.info('Waiting for in-progress tasks to complete')
                concurrent.futures.wait(to_process, timeout=None)
-
+                logger.info('Cancellation successful')
 
    final_path = os.path.join(
        dataset_args.data_dir, dataset_args.processed_file)
 
    if preprocess_args.do_create:
-
+        logger.info('Create final data')
 
        final_data = {}
 
@@ -786,7 +788,7 @@ def main():
        dataset_args.data_dir, dataset_args.negative_file)
 
    if preprocess_args.do_generate:
-
+        logger.info('Generating')
        # max_videos=preprocess_args.max_videos,
        # max_segments=preprocess_args.max_segments,
        # , max_videos, max_segments
@@ -868,8 +870,8 @@ def main():
                print(json.dumps(d), file=negative)
 
    if preprocess_args.do_split:
-
-
+        logger.info('Splitting')
+        logger.info('Read files')
 
        with open(positive_file, encoding='utf-8') as positive:
            sponsors = positive.readlines()
@@ -877,11 +879,11 @@ def main():
        with open(negative_file, encoding='utf-8') as negative:
            non_sponsors = negative.readlines()
 
-
+        logger.info('Shuffle')
        random.shuffle(sponsors)
        random.shuffle(non_sponsors)
 
-
+        logger.info('Calculate ratios')
        # Ensure correct ratio of positive to negative segments
        percentage_negative = 1 - preprocess_args.percentage_positive
 
@@ -901,12 +903,12 @@ def main():
        excess = non_sponsors[z:]
        non_sponsors = non_sponsors[:z]
 
-
+        logger.info('Join')
        all_labelled_segments = sponsors + non_sponsors
 
        random.shuffle(all_labelled_segments)
 
-
+        logger.info('Split')
        ratios = [preprocess_args.train_split,
                  preprocess_args.test_split,
                  preprocess_args.valid_split]
@@ -927,9 +929,9 @@ def main():
            with open(outfile, 'w', encoding='utf-8') as fp:
                fp.writelines(items)
        else:
-
+            logger.info(f'Skipping {name}')
 
-
+        logger.info('Write')
        # Save excess items
        excess_path = os.path.join(
            dataset_args.data_dir, dataset_args.excess_file)
@@ -937,10 +939,9 @@ def main():
        with open(excess_path, 'w', encoding='utf-8') as fp:
            fp.writelines(excess)
        else:
-
+            logger.info(f'Skipping {dataset_args.excess_file}')
 
-
-            'sponsors,', len(non_sponsors), 'non sponsors')
+        logger.info(f'Finished splitting: {len(sponsors)} sponsors, {len(non_sponsors)} non sponsors')
 
 
 def split(arr, ratios):
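Note that the commit only adds `logger = logging.getLogger(__name__)` and the call sites; it does not show where (or whether) the logging system itself is configured. A minimal sketch, assuming configuration happens once at startup via `logging.basicConfig` (an assumption, not part of this diff), of how the new `logger.info`/`logger.warning` calls become visible:

    import logging

    # Module-level logger, as introduced by this commit in src/preprocess.py
    logger = logging.getLogger(__name__)


    def main():
        # Assumed setup: without a configured handler, Python's "last resort"
        # handler only emits WARNING and above, so the new logger.info()
        # messages would otherwise be dropped.
        logging.basicConfig(
            format='%(asctime)s %(levelname)s %(name)s: %(message)s',
            level=logging.INFO,
        )

        logger.info('Updating database')       # visible once basicConfig has run
        logger.warning('Failed, trying next')  # visible even without basicConfig


    if __name__ == '__main__':
        main()

Keeping the call sites on a named logger means verbosity can later be adjusted centrally (per-module levels, file handlers) without editing each message.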
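Several of the new log lines narrate the KeyboardInterrupt path around the transcription pool: cancel futures that have not started, then wait for in-flight ones before exiting. A self-contained sketch of that pattern; `transcribe`, `run`, and the one-second sleep are illustrative stand-ins for the script's per-video `get_words` jobs, not its actual code:

    import concurrent.futures
    import logging
    import time

    logger = logging.getLogger(__name__)


    def transcribe(video_id):
        # Illustrative stand-in for the per-video work (get_words in the script)
        time.sleep(1)
        return video_id


    def run(video_ids, num_jobs=4):
        with concurrent.futures.ThreadPoolExecutor(max_workers=num_jobs) as pool:
            to_process = {pool.submit(transcribe, video_id) for video_id in video_ids}
            try:
                for future in concurrent.futures.as_completed(to_process):
                    future.result()
            except KeyboardInterrupt:
                logger.info('Gracefully shutting down: Cancelling unscheduled tasks')
                # Only futures that have not started yet can be cancelled;
                # futures already running must be allowed to finish.
                for future in to_process:
                    future.cancel()

                logger.info('Waiting for in-progress tasks to complete')
                concurrent.futures.wait(to_process, timeout=None)
                logger.info('Cancellation successful')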