nroggendorff committed
Commit 73418cf · verified · 1 parent: 84f4c93

Update train.py

Files changed (1):
  1. train.py +45 -36
train.py CHANGED
@@ -67,9 +67,15 @@ def preprocess_example_batch(examples, text):
     }
 
 
-def run_preprocessing(input_dataset, output_dir, num_proc=32, batch_size=100):
+def run_preprocessing(input_dataset, output_dir, num_proc=32, batch_size=100, start_idx=0, end_idx=None):
     print("Loading dataset for preprocessing...")
     ds = datasets.load_dataset(input_dataset, split="train")
+
+    if end_idx is None:
+        end_idx = len(ds)
+
+    print(f"Selecting range [{start_idx}:{end_idx}]...")
+    ds = ds.select(range(start_idx, end_idx))
 
     print("Loading processor...")
     processor = AutoProcessor.from_pretrained("datalab-to/chandra")
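The new `start_idx`/`end_idx` parameters move the half-split selection into `run_preprocessing` itself. A minimal sketch of that selection logic, using a toy in-memory dataset instead of the real `none-yet/anime-captions` repo:

```python
from datasets import Dataset

# Toy stand-in for the real dataset.
ds = Dataset.from_dict({"idx": list(range(10))})

start_idx, end_idx = 3, None
if end_idx is None:  # mirrors the new default in run_preprocessing
    end_idx = len(ds)

# select() takes an iterable of row indices and returns a new Dataset.
subset = ds.select(range(start_idx, end_idx))
print(len(subset))  # 7
```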
@@ -99,7 +105,7 @@ def caption_batch(batch, processor, model):
         k: v.pin_memory().to(model.device, non_blocking=True) for k, v in inputs.items()
     }
 
-    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16): # type: ignore
+    with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
         generated = model.generate(
             **inputs,
             max_new_tokens=128,
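For context, the pinned-memory transfer and bf16 autocast pattern this hunk touches looks roughly like the following. This is a sketch with a stand-in computation instead of `model.generate`, and it assumes a CUDA device is available:

```python
import torch

# Toy batch standing in for the processor's output.
inputs = {"input_ids": torch.randint(0, 100, (1, 16))}

# pin_memory() stages tensors in page-locked host RAM so the
# non_blocking copy to the GPU can overlap with other work.
inputs = {k: v.pin_memory().to("cuda", non_blocking=True) for k, v in inputs.items()}

# no_grad() skips autograd bookkeeping; autocast runs eligible ops
# in bfloat16 for the duration of the block.
with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
    out = inputs["input_ids"].float().mean()  # stand-in for model.generate(**inputs)
print(out.item())
```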
@@ -160,36 +166,34 @@ def process_shard(
 def main():
     mp.set_start_method("spawn", force=True)
 
+    init_stage = os.environ.get("INIT", "0")
+
     input_dataset = "none-yet/anime-captions"
-    preprocessed_dataset = "temp_preprocessed"
     output_dataset = "nroggendorff/anime-captions"
     model_name = "datalab-to/chandra"
     batch_size = 20
 
-    init_flag = os.environ.get("INIT", "0")
-    is_first_run = init_flag == "0"
-    is_second_run = init_flag == "1"
+    print(f"Running stage INIT={init_stage}")
+
+    full_ds = datasets.load_dataset(input_dataset, split="train")
+    total_dataset_size = len(full_ds)
+    midpoint = total_dataset_size // 2
+
+    if init_stage == "0":
+        print(f"Stage 0: Processing first half [0:{midpoint}]")
+        preprocessed_dataset = "temp_preprocessed_0"
+        start_idx = 0
+        end_idx = midpoint
+        final_output = f"{output_dataset}_part0"
+    else:
+        print(f"Stage 1: Processing second half [{midpoint}:{total_dataset_size}]")
+        preprocessed_dataset = "temp_preprocessed_1"
+        start_idx = midpoint
+        end_idx = total_dataset_size
+        final_output = input_dataset
 
     if not os.path.exists(preprocessed_dataset):
-        print(f"[{'First' if is_first_run else 'Second'} Run] Running preprocessing...")
-        ds_full = datasets.load_dataset(input_dataset, split="train")
-        total_size = len(ds_full)
-        midpoint = total_size // 2
-
-        if is_first_run:
-            ds_to_process = ds_full.select(range(0, midpoint))
-        else:
-            ds_to_process = ds_full.select(range(midpoint, total_size))
-
-        print(
-            f"[{'First' if is_first_run else 'Second'} Run] Saving selected shard to disk..."
-        )
-        ds_to_process.save_to_disk("temp_input_shard")
-
-        run_preprocessing("temp_input_shard", preprocessed_dataset)
-
-        # Clean up temp input shard
-        shutil.rmtree("temp_input_shard")
+        run_preprocessing(input_dataset, preprocessed_dataset, start_idx=start_idx, end_idx=end_idx)
 
     print("Loading preprocessed dataset...")
     ds = datasets.load_from_disk(preprocessed_dataset)
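The rewritten `main()` now derives every per-stage value (slice bounds, preprocessing directory, push target) from the single `INIT` environment variable, which also removes the `temp_input_shard` save/load round trip. A compact sketch of that branch logic, with names mirroring the diff and an illustrative size:

```python
import os

def stage_config(init_stage, total, output_dataset, input_dataset):
    """Return (start_idx, end_idx, preprocessed_dir, final_output) for a stage."""
    midpoint = total // 2
    if init_stage == "0":
        return 0, midpoint, "temp_preprocessed_0", f"{output_dataset}_part0"
    return midpoint, total, "temp_preprocessed_1", input_dataset

# Hypothetical repo names, just to show the two stages side by side.
print(stage_config("0", 1000, "user/out", "user/in"))  # (0, 500, 'temp_preprocessed_0', 'user/out_part0')
print(stage_config("1", 1000, "user/out", "user/in"))  # (500, 1000, 'temp_preprocessed_1', 'user/in')
```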
@@ -207,7 +211,7 @@ def main():
     for i in range(num_gpus):
         start = i * shard_size
         end = start + shard_size if i < num_gpus - 1 else total_size
-        output_file = f"temp_shard_{i}"
+        output_file = f"temp_shard_{init_stage}_{i}"
        temp_files.append(output_file)
 
         p = mp.Process(
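Prefixing the per-GPU temp shard names with the stage keeps stage-0 and stage-1 worker outputs from colliding when both runs share a disk. Illustrative values:

```python
init_stage, num_gpus = "0", 4
temp_files = [f"temp_shard_{init_stage}_{i}" for i in range(num_gpus)]
print(temp_files)  # ['temp_shard_0_0', 'temp_shard_0_1', 'temp_shard_0_2', 'temp_shard_0_3']
```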
@@ -242,16 +246,21 @@ def main():
     shards = [cast(Dataset, datasets.load_from_disk(f)) for f in temp_files]
     final_ds = datasets.concatenate_datasets(shards)
 
-    if is_first_run:
-        print("First run: pushing first half to hub...")
-        final_ds.push_to_hub(output_dataset, create_pr=False)
+    print(f"Final dataset size: {len(final_ds)}")
+
+    if init_stage == "0":
+        print(f"Pushing first half to {final_output}...")
+        final_ds.push_to_hub(final_output, create_pr=False)
     else:
-        print("Second run: loading first half and merging...")
-        first_half_ds = datasets.load_dataset(output_dataset, split="train")
-        merged_ds = datasets.concatenate_datasets([first_half_ds, final_ds])
-        print(f"Final merged dataset size: {len(merged_ds)}")
-        print("Pushing full dataset with create_pr=True...")
-        merged_ds.push_to_hub(output_dataset, create_pr=True)
+        print("Loading first half from hub...")
+        first_half = datasets.load_dataset(f"{output_dataset}_part0", split="train")
+
+        print("Concatenating both halves...")
+        complete_ds = datasets.concatenate_datasets([first_half, final_ds])
+
+        print(f"Complete dataset size: {len(complete_ds)}")
+        print(f"Pushing complete dataset to {final_output} with PR...")
+        complete_ds.push_to_hub(final_output, create_pr=True)
 
     print("Cleaning up temporary files...")
     for f in temp_files:
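The stage-1 branch now rebuilds the full dataset by concatenating the previously pushed `_part0` half with this run's captions before opening a hub PR. A self-contained sketch with toy data; the `push_to_hub` call is commented out since it needs hub credentials, and the repo name is illustrative:

```python
from datasets import Dataset, concatenate_datasets

first_half = Dataset.from_dict({"idx": [0, 1, 2]})
second_half = Dataset.from_dict({"idx": [3, 4, 5]})

# Order matters: first half, then the freshly captioned second half.
complete_ds = concatenate_datasets([first_half, second_half])
print(len(complete_ds))  # 6

# create_pr=True opens a pull request on the target repo
# instead of committing to main directly.
# complete_ds.push_to_hub("user/repo", create_pr=True)
```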
@@ -264,4 +273,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
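Taken together, the script is now meant to be run twice, once per half. A hypothetical driver showing the intended order (the real workflow presumably sets `INIT` in the job environment):

```python
import os
import subprocess

# Stage 0 captions and pushes the first half; stage 1 captions the
# second half, merges both, and opens a PR on the hub.
for stage in ("0", "1"):
    subprocess.run(
        ["python", "train.py"],
        env={**os.environ, "INIT": stage},
        check=True,
    )
```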