Spaces:

sander-wood
/

clamp3

Runtime error

App Files Files Community

sander-wood committed on Feb 12

Commit

ad822ab

verified ·

1 Parent(s): 20c1bbc

Upload 8 files

Browse files

Files changed (8) hide show

README.md +3 -3
app.py +304 -0
config.py +79 -0
extract_clamp3.py +189 -0
features.zip +3 -0
requirements.txt +72 -0
utils.py +574 -0
wikimt-x-public.jsonl +0 -0

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
 title: Clamp3
-emoji: ⚡
-colorFrom: gray
-colorTo: pink
 sdk: gradio
 sdk_version: 5.16.0
 app_file: app.py

 ---
 title: Clamp3
+emoji: 🗜️
+colorFrom: pink
+colorTo: yellow
 sdk: gradio
 sdk_version: 5.16.0
 app_file: app.py

app.py ADDED Viewed

	@@ -0,0 +1,304 @@

+import os
+import torch
+import numpy as np
+import gradio as gr
+import zipfile
+import json
+import requests
+import subprocess
+import shutil
+from transformers import BlipProcessor, BlipForConditionalGeneration
+title = "# 🗜️ CLaMP 3 - Multimodal & Multilingual Semantic Music Search"
+badges = """
+    <div style="text-align: center;">
+        <a href="#"><img src="https://img.shields.io/badge/CLaMP%203%20Homepage-Coming%20Soon-lightgrey?style=for-the-badge&logo=home-assistant" alt="Homepage"></a>
+        <a href="#"><img src="https://img.shields.io/badge/CLaMP%203%20Paper-Coming%20Soon-lightgrey?style=for-the-badge&logo=arxiv" alt="Paper"></a>
+        <a href="https://github.com/sanderwood/clamp3"><img src="https://img.shields.io/badge/CLaMP%203%20Code-GitHub-181717?style=for-the-badge&logo=github" alt="GitHub"></a>
+        <a href="https://huggingface.co/sander-wood/clamp3/tree/main"><img src="https://img.shields.io/badge/Model%20Weights-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Model Weights"></a>
+        <a href="https://huggingface.co/datasets/sander-wood/m4-rag"><img src="https://img.shields.io/badge/M4--RAG%20Pretraining%20Dataset-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Dataset"></a>
+        <a href="https://huggingface.co/datasets/sander-wood/wikimt-x"><img src="https://img.shields.io/badge/WikiMT--X%20Evaluation%20Benchmark-Hugging%20Face-ffcc00?style=for-the-badge&logo=huggingface" alt="Benchmark"></a>
+    </div>
+    <style>
+        div a {
+            display: inline-block;
+            margin: 5px;
+        }
+        div a img {
+            height: 30px;
+        }
+    </style>
+"""
+description = """CLaMP 3 is a **multimodal and multilingual** music information retrieval (MIR) framework, supporting **sheet music, audio, and performance signals** in over **100 languages**. Using **contrastive learning**, it aligns these modalities in a shared space for **cross-modal retrieval**.
+### 🔍 **How This Demo Works**
+- You can **retrieve music using any text input (in any language) or an image** (`.png`, `.jpg`).
+- When using an image, **BLIP** generates a caption, which is then used for retrieval.
+- Since CLaMP 3's training data includes **rich visual descriptions of musical scenes**, it can **match images to semantically relevant music**.
+### ⚠️ **Limitations**
+- This demo retrieves music **only from the WikiMT-X benchmark (1,000 pieces)**.
+- These pieces are **mainly from the U.S. and Western Europe (especially the U.S.)** and **mostly from the 20th century**.
+- The retrieval results are **mostly limited to Western 20th-century music**, so you **won’t** find music from **other regions or historical periods**.
+- If you need retrieval for a **different music collection**, deploy **CLaMP 3 on your own dataset**.
+This demo is for **research purposes only**."""
+# Load BLIP image captioning model and processor
+processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+# Download weight file if it does not exist
+weights_url = "https://huggingface.co/sander-wood/clamp3/resolve/main/weights_clamp3_saas_h_size_768_t_model_FacebookAI_xlm-roberta-base_t_length_128_a_size_768_a_layers_12_a_length_128_s_size_768_s_layers_12_p_size_64_p_length_512.pth"
+weights_filename = "weights_clamp3_saas_h_size_768_t_model_FacebookAI_xlm-roberta-base_t_length_128_a_size_768_a_layers_12_a_length_128_s_size_768_s_layers_12_p_size_64_p_length_512.pth"
+if not os.path.exists(weights_filename):
+    print("Downloading weights file...")
+    response = requests.get(weights_url, stream=True)
+    response.raise_for_status()
+    with open(weights_filename, "wb") as f:
+        for chunk in response.iter_content(chunk_size=8192):
+            if chunk:
+                f.write(chunk)
+    print("Weights file downloaded.")
+ZIP_PATH = "features.zip"
+if os.path.exists(ZIP_PATH):
+    print(f"Extracting {ZIP_PATH}...")
+    with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
+        zip_ref.extractall(".")
+    print("Extraction complete.")
+# Load metadata
+metadata_map = {}
+METADATA_FILE = "wikimt-x-public.jsonl"
+if os.path.exists(METADATA_FILE):
+    with open(METADATA_FILE, "r", encoding="utf-8") as f:
+        for line in f:
+            data = json.loads(line)
+            metadata_map[data["id"]] = data
+else:
+    print(f"Warning: {METADATA_FILE} not found.")
+features_cache = {}
+def get_info(folder_path):
+    """
+    Load all .npy files from the specified folder and return a dictionary
+    with the file names (without extension) as keys.
+    """
+    if folder_path in features_cache:
+        return features_cache[folder_path]
+    if not os.path.exists(folder_path):
+        return {}
+    files = sorted(os.listdir(folder_path))
+    features = {}
+    for file in files:
+        if file.endswith(".npy"):
+            key = file.split(".")[0]
+            try:
+                features[key] = np.load(os.path.join(folder_path, file))[0]
+            except Exception as e:
+                print(f"Error loading {file}: {e}")
+    features_cache[folder_path] = features
+    return features
+def find_top_similar(query_file, reference_folder):
+    """
+    Compare the query feature with all reference features in the specified folder
+    using cosine similarity and return the top 10 candidate results in the format:
+    Title | Artists | sim: SimilarityScore.
+    """
+    top_k = 10
+    try:
+        query_feature = np.load(query_file.name)[0]
+    except Exception as e:
+        return [], f"Error loading query feature: {e}"
+    query_tensor = torch.tensor(query_feature, dtype=torch.float32).unsqueeze(dim=0)
+    key_features = get_info(reference_folder)
+    if not key_features:
+        return [], f"No reference features found in {reference_folder}."
+    ref_keys = list(key_features.keys())
+    ref_array = np.array([key_features[k] for k in ref_keys])
+    key_feats_tensor = torch.tensor(ref_array, dtype=torch.float32)
+    query_tensor_expanded = query_tensor.expand(key_feats_tensor.size(0), -1)
+    similarities = torch.cosine_similarity(query_tensor_expanded, key_feats_tensor, dim=1)
+    ranked_indices = torch.argsort(similarities, descending=True)
+    candidate_ids = []
+    candidate_display = []
+    for i in range(top_k):
+        if i < len(ref_keys):
+            candidate_idx = ranked_indices[i].item()
+            candidate_id = ref_keys[candidate_idx]
+            sim = round(similarities[candidate_idx].item(), 4)
+            meta = metadata_map.get(candidate_id, {})
+            title = meta.get("title", candidate_id)
+            artists = meta.get("artists", "Unknown")
+            if isinstance(artists, list):
+                artists = ", ".join(artists)
+            candidate_ids.append(candidate_id)
+            candidate_display.append(f"{title} | {artists} | sim: {sim}")
+        else:
+            candidate_ids.append("N/A")
+            candidate_display.append("N/A")
+    return candidate_ids, candidate_display
+def show_details(selected_id):
+    """
+    Return detailed metadata and embedded YouTube video HTML based on the candidate ID.
+    """
+    if selected_id == "N/A":
+        return ("", "", "", "", "", "", "", "")
+    data = metadata_map.get(selected_id, {})
+    if not data:
+        return ("No details found", "", "", "", "", "", "", "")
+    title = data.get("title", "")
+    artists = data.get("artists", "")
+    if isinstance(artists, list):
+        artists = ", ".join(artists)
+    genre = data.get("genre", "")
+    background = data.get("background", "")
+    analysis = data.get("analysis", "")
+    description = data.get("description", "")
+    scene = data.get("scene", "")
+    youtube_html = (
+        f'<iframe width="560" height="315" src="https://www.youtube.com/embed/{selected_id}" '
+        f'frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; '
+        f'gyroscope; picture-in-picture" allowfullscreen></iframe>'
+    )
+    return title, artists, genre, background, analysis, description, scene, youtube_html
+def extract_features_from_text(text):
+    """
+    Save the input text to a file, call the CLaMP 3 feature extraction script,
+    and return the generated feature file path.
+    """
+    input_dir = "input_dir"
+    output_dir = "output_dir"
+    os.makedirs(input_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
+    # Clear input_dir and output_dir
+    for d in [input_dir, output_dir]:
+        for filename in os.listdir(d):
+            file_path = os.path.join(d, filename)
+            if os.path.isfile(file_path) or os.path.islink(file_path):
+                os.unlink(file_path)
+            elif os.path.isdir(file_path):
+                shutil.rmtree(file_path)
+    input_file = os.path.join(input_dir, "input.txt")
+    print("Text input:", text)
+    with open(input_file, "w", encoding="utf-8") as f:
+        f.write(text)
+    command = ["python", "extract_clamp3.py", input_dir, output_dir, "--get_global"]
+    subprocess.run(command, check=True)
+    output_file = os.path.join(output_dir, "input.npy")
+    return output_file
+def generate_caption(image):
+    """
+    Use the BLIP model to generate a descriptive caption for the given image.
+    """
+    inputs = processor(image, return_tensors="pt")
+    outputs = blip_model.generate(**inputs)
+    caption = processor.decode(outputs[0], skip_special_tokens=True)
+    return caption
+class FileWrapper:
+    """
+    Simulate a file object with a .name attribute.
+
+    find_top_similar reads the query path from ``query_file.name``; this wrapper
+    lets a plain path string be passed where such an object is expected.
+    """
+    def __init__(self, path):
+        # Store the path under the attribute name that consumers read.
+        self.name = path
+def search_wrapper(search_mode, text_input, image_input):
+    """
+    Perform retrieval based on the selected input mode:
+      - If search_mode is "Image", use the uploaded image to generate a caption, then extract features
+        and search in the "image/" folder.
+      - If search_mode is "Text", use the provided text to extract features and search in the "image/" folder.
+    """
+    if search_mode == "Image":
+        if image_input is None:
+            return text_input, gr.update(choices=[]), "Please upload an image.", "", "", "", "", "", "", ""
+        caption = generate_caption(image_input)
+        text_to_use = caption
+        reference_folder = "image/"
+    elif search_mode == "Text":
+        if not text_input or text_input.strip() == "":
+            return "Describe the music you're looking for (in any language)", gr.update(choices=[]), "Please enter text for retrieval.", "", "", "", "", "", "", ""
+        text_to_use = text_input
+        reference_folder = "text/"
+    else:
+        return "Describe the music you're looking for (in any language)", gr.update(choices=[]), "Invalid search mode selected.", "", "", "", "", "", "", ""
+    try:
+        output_file = extract_features_from_text(text_to_use)
+        query_file = FileWrapper(output_file)
+    except Exception as e:
+        return text_to_use, gr.update(choices=[]), f"Error during feature extraction: {e}", "", "", "", "", "", "", ""
+    candidate_ids, candidate_display = find_top_similar(query_file, reference_folder)
+    if not candidate_ids:
+        return text_to_use, gr.update(choices=[]), "", "", "", "", "", "", "", ""
+    choices = [(f"{i+1}. {disp}", cid) for i, (cid, disp) in enumerate(zip(candidate_ids, candidate_display))]
+    top_candidate = candidate_ids[0]
+    details = show_details(top_candidate)
+    return text_to_use, gr.update(choices=choices), *details
+# Build the Gradio interface: header texts on top, then a two-column row with
+# the query controls on the left and the details of the selected result on the
+# right, and finally the event wiring.
+with gr.Blocks() as demo:
+    gr.Markdown(title)
+    gr.HTML(badges)
+    gr.Markdown(description)
+    # CSS for components tagged with the "vertical-radio" class: each radio
+    # label is rendered as a block element.
+    gr.HTML(
+        """
+        <style>
+          .vertical-radio .gradio-radio label {
+              display: block !important;
+              margin-bottom: 5px;
+          }
+        </style>
+        """
+    )
+    with gr.Row():
+        # Left column: search mode, query inputs, search button, result list.
+        with gr.Column():
+            search_mode = gr.Radio(
+                choices=["Text", "Image"],
+                label="Select Search Mode",
+                value="Text",
+                interactive=True,
+                elem_classes=["vertical-radio"]
+            )
+            text_input = gr.Textbox(
+                                    placeholder="Describe the music you're looking for (in any language)",
+                                    lines=4
+                                )
+            image_input = gr.Image(
+                                    label="Or upload an image (PNG, JPG)",
+                                    type="pil"
+                                )
+            search_button = gr.Button("Search")
+            # Starts without choices; search_wrapper fills them in.
+            candidate_radio = gr.Radio(choices=[], label="Select Retrieval Result", interactive=True, elem_classes=["vertical-radio"])
+        # Right column: embedded video and read-only metadata fields.
+        with gr.Column():
+            gr.Markdown("### YouTube Video")
+            youtube_box = gr.HTML(label="YouTube Video")
+            gr.Markdown("### Metadata")
+            title_box = gr.Textbox(label="Title", interactive=False)
+            artists_box = gr.Textbox(label="Artists", interactive=False)
+            genre_box = gr.Textbox(label="Genre", interactive=False)
+            background_box = gr.Textbox(label="Background", interactive=False)
+            analysis_box = gr.Textbox(label="Analysis", interactive=False)
+            description_box = gr.Textbox(label="Description", interactive=False)
+            scene_box = gr.Textbox(label="Scene", interactive=False)
+    # The outputs list must stay in the order of search_wrapper's return tuple.
+    # text_input is also an output: in Image mode it receives the generated
+    # caption.
+    search_button.click(
+        fn=search_wrapper,
+        inputs=[search_mode, text_input, image_input],
+        outputs=[text_input, candidate_radio, title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box]
+    )
+    # Selecting another candidate refreshes the detail fields; the outputs list
+    # follows the order of show_details' return tuple.
+    candidate_radio.change(
+        fn=show_details,
+        inputs=candidate_radio,
+        outputs=[title_box, artists_box, genre_box, background_box, analysis_box, description_box, scene_box, youtube_box]
+    )
+# Start the app when the module is executed.
+demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,79 @@

+EVAL_SPLIT = 0.01  # Fraction of training data used for evaluation
+WANDB_KEY = "<YOUR_WANDB_KEY>"  # Weights and Biases API key
+# -------------------- Configuration for M3 Training --------------------
+M3_TRAIN_FOLDERS = [
+    "<YOUR_TRAINING_DATA_FOLDER>"  # Directory containing training data for M3
+]
+M3_EVAL_FOLDERS = [
+    "<YOUR_EVALUATION_DATA_FOLDER>"  # Directory containing evaluation data for M3 (optional)
+]
+PATCH_SIZE = 64  # Size of each patch
+PATCH_LENGTH = 512  # Length of the patches
+PATCH_NUM_LAYERS = 12  # Number of layers in the encoder
+TOKEN_NUM_LAYERS = 3  # Number of layers in the decoder
+M3_HIDDEN_SIZE = 768  # Size of the hidden layer
+M3_NUM_EPOCH = 100  # Maximum number of epochs for training
+M3_LEARNING_RATE = 1e-4  # Learning rate for the optimizer
+M3_BATCH_SIZE = 16  # Batch size per GPU (single card) during training
+M3_MASK_RATIO = 0.45  # Ratio of masked elements during training
+M3_DETERMINISTIC = True  # Ensures deterministic results with random seeds
+M3_WANDB_LOG = True  # Enable logging to Weights and Biases
+M3_LOAD_CKPT = True  # Load model weights from a checkpoint if available
+M3_WEIGHTS_PATH = (
+    "weights_m3"+
+    "_h_size_" + str(M3_HIDDEN_SIZE) +
+    "_t_layers_" + str(TOKEN_NUM_LAYERS) +
+    "_p_layers_" + str(PATCH_NUM_LAYERS) +
+    "_p_size_" + str(PATCH_SIZE) +
+    "_p_length_" + str(PATCH_LENGTH) +
+    "_lr_" + str(M3_LEARNING_RATE) +
+    "_batch_" + str(M3_BATCH_SIZE) +
+    "_mask_" + str(M3_MASK_RATIO) + ".pth"
+)  # Path to store the model weights
+M3_LOGS_PATH = M3_WEIGHTS_PATH.replace("weights", "logs").replace("pth", "txt")  # Path to save training logs
+# -------------------- Configuration for CLaMP3 Training ----------------
+CLAMP3_TRAIN_JSONL = "<YOUR_TRAINING_JSONL_FILE>"  # Path to the JSONL file with training data for CLaMP3
+CLAMP3_EVAL_JSONL = "<YOUR_EVALUATION_JSONL_FILE>"  # Path to the JSONL file with evaluation data for CLaMP3 (optional)
+CLAMP3_HIDDEN_SIZE = 768  # Size of the hidden layer
+TEXT_MODEL_NAME = "FacebookAI/xlm-roberta-base"  # Name of the pre-trained text model
+MAX_TEXT_LENGTH = 128  # Maximum allowed length for text input
+AUDIO_HIDDEN_SIZE = 768  # Size of the hidden layer for audio features
+AUDIO_NUM_LAYERS = 12  # Number of layers in the audio encoder
+MAX_AUDIO_LENGTH = 128  # Maximum allowed length for audio input
+CLAMP3_NUM_EPOCH = 100  # Maximum number of epochs for training
+CLAMP3_LEARNING_RATE = 1e-5  # Learning rate for the optimizer
+CLAMP3_BATCH_SIZE = 256  # Batch size per GPU (single card) during training
+LOGIT_SCALE = 1  # Scaling factor for contrastive loss
+FREEZE_TEXT = False  # Freeze the weights of the text model and text projection layer
+TEXT_DROPOUT = True  # Whether to apply dropout during text processing
+CLAMP3_DETERMINISTIC = True  # Ensures deterministic results with random seeds
+CLAMP3_LOAD_M3 = True  # Load weights from the M3 model
+CLAMP3_WANDB_LOG = True  # Enable logging to Weights and Biases
+CLAMP3_LOAD_CKPT = True  # Load weights from a checkpoint if available
+SAVE_EVERY = 5  # Save model weights every SAVE_EVERY epochs
+CLAMP3_WEIGHTS_PATH = (
+    "weights_clamp3_saas" +
+    "_h_size_" + str(CLAMP3_HIDDEN_SIZE) +
+    "_t_model_" + TEXT_MODEL_NAME.replace("/", "_") +
+    "_t_length_" + str(MAX_TEXT_LENGTH) +
+    "_a_size_" + str(AUDIO_HIDDEN_SIZE) +
+    "_a_layers_" + str(AUDIO_NUM_LAYERS) +
+    "_a_length_" + str(MAX_AUDIO_LENGTH) +
+    "_s_size_" + str(M3_HIDDEN_SIZE) +
+    "_s_layers_" + str(PATCH_NUM_LAYERS) +
+    "_p_size_" + str(PATCH_SIZE) +
+    "_p_length_" + str(PATCH_LENGTH) + ".pth"
+)  # Path to store CLaMP3 model weights
+CLAMP3_LOGS_PATH = CLAMP3_WEIGHTS_PATH.replace("weights", "logs").replace("pth", "txt")  # Path to save training logs

extract_clamp3.py ADDED Viewed

	@@ -0,0 +1,189 @@

+import os
+import torch
+import numpy as np
+from tqdm import tqdm
+from config import *
+from utils import *
+from samplings import *
+from accelerate import Accelerator
+from transformers import BertConfig, AutoTokenizer
+import argparse
+# Parse command-line arguments
+parser = argparse.ArgumentParser(description="Feature extraction for CLaMP3.")
+parser.add_argument("--epoch", type=str, default=None, help="Epoch of the checkpoint to load.")
+parser.add_argument("input_dir", type=str, help="Directory containing input data files.")
+parser.add_argument("output_dir", type=str, help="Directory to save the output features.")
+parser.add_argument("--get_global", action="store_true", help="Get global feature.")
+args = parser.parse_args()
+# Retrieve arguments
+epoch = args.epoch
+input_dir = args.input_dir
+output_dir = args.output_dir
+get_global = args.get_global
+files = []
+for root, dirs, fs in os.walk(input_dir):
+    for f in fs:
+        if f.endswith(".txt") or f.endswith(".abc") or f.endswith(".mtf") or f.endswith(".npy"):
+            files.append(os.path.join(root, f))
+print(f"Found {len(files)} files in total")
+# Initialize accelerator and device
+accelerator = Accelerator()
+device = accelerator.device
+print("Using device:", device)
+# Model and configuration setup.
+# Both encoder configs use hidden_size // 64 attention heads and a
+# feed-forward width of 4 * hidden_size; sizes and depths come from config.py.
+audio_config = BertConfig(vocab_size=1,
+                        hidden_size=AUDIO_HIDDEN_SIZE,
+                        num_hidden_layers=AUDIO_NUM_LAYERS,
+                        num_attention_heads=AUDIO_HIDDEN_SIZE//64,
+                        intermediate_size=AUDIO_HIDDEN_SIZE*4,
+                        max_position_embeddings=MAX_AUDIO_LENGTH)
+symbolic_config = BertConfig(vocab_size=1,
+                            hidden_size=M3_HIDDEN_SIZE,
+                            num_hidden_layers=PATCH_NUM_LAYERS,
+                            num_attention_heads=M3_HIDDEN_SIZE//64,
+                            intermediate_size=M3_HIDDEN_SIZE*4,
+                            max_position_embeddings=PATCH_LENGTH)
+model = CLaMP3Model(audio_config=audio_config,
+                    symbolic_config=symbolic_config,
+                    text_model_name=TEXT_MODEL_NAME,
+                    hidden_size=CLAMP3_HIDDEN_SIZE,
+                    load_m3=CLAMP3_LOAD_M3)
+model = model.to(device)
+# Tokenizer for .txt inputs and patchilizer for .abc/.mtf inputs.
+tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_NAME)
+patchilizer = M3Patchilizer()
+# print parameter number
+print("Total Parameter Number: "+str(sum(p.numel() for p in model.parameters())))
+# Load model weights (the model is put in evaluation mode first).
+model.eval()
+# Default checkpoint path from config.py; with --epoch the per-epoch file
+# "<name>_<epoch>.pth" is used instead.
+checkpoint_path = CLAMP3_WEIGHTS_PATH
+if epoch is not None:
+    checkpoint_path = CLAMP3_WEIGHTS_PATH.replace(".pth", f"_{epoch}.pth")
+# The checkpoint is read as a mapping with the keys 'epoch', 'min_eval_loss'
+# and 'model' (the state dict).
+checkpoint = torch.load(checkpoint_path, map_location="cpu", weights_only=True)
+print(f"Successfully Loaded CLaMP 3 Checkpoint from Epoch {checkpoint['epoch']} with loss {checkpoint['min_eval_loss']}")
+model.load_state_dict(checkpoint['model'])
+def extract_feature(filename, get_global=get_global):
+    if not filename.endswith(".npy"):
+        with open(filename, "r", encoding="utf-8") as f:
+            item = f.read()
+    if filename.endswith(".txt"):
+        item = list(set(item.split("\n")))
+        item = "\n".join(item)
+        item = item.split("\n")
+        item = [c for c in item if len(c) > 0]
+        item = tokenizer.sep_token.join(item)
+        input_data = tokenizer(item, return_tensors="pt")
+        input_data = input_data['input_ids'].squeeze(0)
+        max_input_length = MAX_TEXT_LENGTH
+    elif filename.endswith(".abc") or filename.endswith(".mtf"):
+        input_data = patchilizer.encode(item, add_special_patches=True)
+        input_data = torch.tensor(input_data)
+        max_input_length = PATCH_LENGTH
+    elif filename.endswith(".npy"):
+        input_data = np.load(filename)
+        input_data = torch.tensor(input_data)
+        input_data = input_data.reshape(-1, input_data.size(-1))
+        zero_vec = torch.zeros((1, input_data.size(-1)))
+        input_data = torch.cat((zero_vec, input_data, zero_vec), 0)
+        max_input_length = MAX_AUDIO_LENGTH
+    else:
+        raise ValueError(f"Unsupported file type: {filename}, only support .txt, .abc, .mtf, .npy files")
+    segment_list = []
+    for i in range(0, len(input_data), max_input_length):
+        segment_list.append(input_data[i:i+max_input_length])
+    segment_list[-1] = input_data[-max_input_length:]
+    last_hidden_states_list = []
+    for input_segment in segment_list:
+        input_masks = torch.tensor([1]*input_segment.size(0))
+        if filename.endswith(".txt"):
+            pad_indices = torch.ones(MAX_TEXT_LENGTH - input_segment.size(0)).long() * tokenizer.pad_token_id
+        elif filename.endswith(".abc") or filename.endswith(".mtf"):
+            pad_indices = torch.ones((PATCH_LENGTH - input_segment.size(0), PATCH_SIZE)).long() * patchilizer.pad_token_id
+        else:
+            pad_indices = torch.ones((MAX_AUDIO_LENGTH - input_segment.size(0), AUDIO_HIDDEN_SIZE)).float() * 0.
+        input_masks = torch.cat((input_masks, torch.zeros(max_input_length - input_segment.size(0))), 0)
+        input_segment = torch.cat((input_segment, pad_indices), 0)
+        if filename.endswith(".txt"):
+            last_hidden_states = model.get_text_features(text_inputs=input_segment.unsqueeze(0).to(device),
+                                                         text_masks=input_masks.unsqueeze(0).to(device),
+                                                         get_global=get_global)
+        elif filename.endswith(".abc") or filename.endswith(".mtf"):
+            last_hidden_states = model.get_symbolic_features(symbolic_inputs=input_segment.unsqueeze(0).to(device),
+                                                          symbolic_masks=input_masks.unsqueeze(0).to(device),
+                                                          get_global=get_global)
+        else:
+            last_hidden_states = model.get_audio_features(audio_inputs=input_segment.unsqueeze(0).to(device),
+                                                          audio_masks=input_masks.unsqueeze(0).to(device),
+                                                          get_global=get_global)
+        if not get_global:
+            last_hidden_states = last_hidden_states[:, :input_masks.sum().long().item(), :]
+        last_hidden_states_list.append(last_hidden_states)
+    if not get_global:
+        last_hidden_states_list = [last_hidden_states[0] for last_hidden_states in last_hidden_states_list]
+        last_hidden_states_list[-1] = last_hidden_states_list[-1][-(len(input_data)%max_input_length):]
+        last_hidden_states_list = torch.concat(last_hidden_states_list, 0)
+    else:
+        full_chunk_cnt = len(input_data) // max_input_length
+        remain_chunk_len = len(input_data) % max_input_length
+        if remain_chunk_len == 0:
+            feature_weights = torch.tensor([max_input_length] * full_chunk_cnt, device=device).view(-1, 1)
+        else:
+            feature_weights = torch.tensor([max_input_length] * full_chunk_cnt + [remain_chunk_len], device=device).view(-1, 1)
+        last_hidden_states_list = torch.concat(last_hidden_states_list, 0)
+        last_hidden_states_list = last_hidden_states_list * feature_weights
+        last_hidden_states_list = last_hidden_states_list.sum(dim=0) / feature_weights.sum()
+    return last_hidden_states_list
+def process_directory(input_dir, output_dir, files):
+    # calculate the number of files to process per GPU
+    num_files_per_gpu = len(files) // accelerator.num_processes
+    # calculate the start and end index for the current GPU
+    start_idx = accelerator.process_index * num_files_per_gpu
+    end_idx = start_idx + num_files_per_gpu
+    if accelerator.process_index == accelerator.num_processes - 1:
+        end_idx = len(files)
+    files_to_process = files[start_idx:end_idx]
+    # process the files
+    for file in tqdm(files_to_process):
+        output_subdir = output_dir + os.path.dirname(file)[len(input_dir):]
+        try:
+            os.makedirs(output_subdir, exist_ok=True)
+        except Exception as e:
+            print(output_subdir + " can not be created\n" + str(e))
+        output_file = os.path.join(output_subdir, os.path.splitext(os.path.basename(file))[0] + ".npy")
+        if os.path.exists(output_file):
+            print(f"Skipping {file}, output already exists")
+            continue
+        try:
+            with torch.no_grad():
+                features = extract_feature(file).unsqueeze(0)
+            np.save(output_file, features.detach().cpu().numpy())
+        except Exception as e:
+            print(f"Failed to process {file}: {e}")
+# process the files
+process_directory(input_dir, output_dir, files)

features.zip ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:60273370643e51e092a466d0e9a28041cfc944b2d0b55f6fbe926081ce1ff570
+size 6242016

requirements.txt ADDED Viewed

	@@ -0,0 +1,72 @@

+# PyTorch (CPU-only version)
+torch==2.4.0
+torchvision==0.19.0
+torchaudio==2.4.0
+-f https://download.pytorch.org/whl/cpu
+# Core dependencies
+numpy==1.26.4
+scipy==1.14.1
+scikit-learn==1.5.1
+pandas==1.3.5
+tqdm==4.66.5
+requests==2.32.3
+pillow==9.5.0
+pyyaml==6.0.1
+typing-extensions==4.12.2
+# Transformers and optimization
+transformers==4.40.0
+optimum==1.21.4
+tokenizers==0.19.1
+sentencepiece==0.2.0
+safetensors==0.4.4
+accelerate==0.34.0
+# Audio processing
+librosa==0.10.1
+soundfile==0.12.1
+pydub==0.25.1
+soxr==0.5.0.post1
+audioread==3.0.1
+nnAudio==0.3.3
+# MIDI and music processing
+mido==1.3.0
+music21==7.3.3
+abctoolkit==0.0.4
+# Natural language processing and text utilities
+nltk==3.8.1
+sacrebleu==2.4.3
+sacremoses==0.0.53
+langdetect==1.0.9
+langid==1.1.6
+language-data==1.2.0
+regex==2023.8.8
+unidecode==1.3.6
+# Hugging Face Hub
+huggingface-hub==0.24.6
+datasets==2.21.0
+# Logging and tracking
+wandb==0.17.8
+setproctitle==1.3.3
+sentry-sdk==2.13.0
+# Utilities
+protobuf==5.28.0
+filelock==3.12.2
+tabulate==0.9.0
+dill==0.3.8
+fsspec==2024.6.1
+xxhash==3.5.0
+gitpython==3.1.43
+certifi==2023.7.22
+charset-normalizer==3.2.0
+urllib3==2.0.4
+yarl==1.9.7
+idna==3.4
+samplings==0.1.7
+six==1.16.0

utils.py ADDED Viewed

	@@ -0,0 +1,574 @@

+import re
+import os
+import math
+import torch
+import random
+from config import *
+from unidecode import unidecode
+from torch.nn import functional as F
+from transformers import AutoModel, BertModel, GPT2LMHeadModel, PreTrainedModel, GPT2Config
+try:
+    import torch.distributed.nn
+    from torch import distributed as dist
+    has_distributed = True
+except ImportError:
+    has_distributed = False
+try:
+    import horovod.torch as hvd
+except ImportError:
+    hvd = None
+class ClipLoss(torch.nn.Module):
+    def __init__(
+            self,
+            local_loss=False,
+            gather_with_grad=False,
+            cache_labels=False,
+            rank=0,
+            world_size=1,
+            use_horovod=False,
+    ):
+        super().__init__()
+        self.local_loss = local_loss
+        self.gather_with_grad = gather_with_grad
+        self.cache_labels = cache_labels
+        self.rank = rank
+        self.world_size = world_size
+        self.use_horovod = use_horovod
+        # cache state
+        self.prev_num_logits = 0
+        self.labels = {}
+    def gather_features(
+            self,
+            image_features,
+            text_features,
+            local_loss=False,
+            gather_with_grad=False,
+            rank=0,
+            world_size=1,
+            use_horovod=False
+    ):
+        """Collect both feature tensors from every worker and concatenate them on dim 0.
+
+        Args:
+            image_features: This worker's features for the first modality.
+            text_features: This worker's features for the second modality.
+            local_loss: When False (and gradients are not gathered), this
+                worker's slot in the gathered list is replaced by the original
+                local tensor.
+            gather_with_grad: Selects the gathering call that is used
+                (``torch.distributed.nn.all_gather`` / plain ``hvd.allgather``)
+                instead of the no-grad / ``dist.all_gather`` path.
+            rank: Index of this worker's slot in the gathered list.
+            world_size: Number of slots that are gathered.
+            use_horovod: Use ``hvd`` instead of ``torch.distributed``.
+
+        Returns:
+            Tuple ``(all_image_features, all_text_features)``.
+
+        Raises:
+            AssertionError: If ``has_distributed`` is falsy, or if
+                ``use_horovod`` is set while ``hvd`` is None.
+        """
+        assert has_distributed, 'torch.distributed did not import correctly, please use a PyTorch version with support.'
+        if use_horovod:
+            assert hvd is not None, 'Please install horovod'
+            if gather_with_grad:
+                all_image_features = hvd.allgather(image_features)
+                all_text_features = hvd.allgather(text_features)
+            else:
+                with torch.no_grad():
+                    all_image_features = hvd.allgather(image_features)
+                    all_text_features = hvd.allgather(text_features)
+                if not local_loss:
+                    # ensure grads for local rank when all_* features don't have a gradient
+                    # NOTE(review): chunking assumes allgather returns the per-rank
+                    # tensors concatenated on dim 0 in rank order - confirm.
+                    gathered_image_features = list(all_image_features.chunk(world_size, dim=0))
+                    gathered_text_features = list(all_text_features.chunk(world_size, dim=0))
+                    gathered_image_features[rank] = image_features
+                    gathered_text_features[rank] = text_features
+                    all_image_features = torch.cat(gathered_image_features, dim=0)
+                    all_text_features = torch.cat(gathered_text_features, dim=0)
+        else:
+            # We gather tensors from all gpus
+            if gather_with_grad:
+                all_image_features = torch.cat(torch.distributed.nn.all_gather(image_features), dim=0)
+                all_text_features = torch.cat(torch.distributed.nn.all_gather(text_features), dim=0)
+            else:
+                # Pre-allocate one receive buffer per slot, shaped like the local tensor.
+                gathered_image_features = [torch.zeros_like(image_features) for _ in range(world_size)]
+                gathered_text_features = [torch.zeros_like(text_features) for _ in range(world_size)]
+                dist.all_gather(gathered_image_features, image_features)
+                dist.all_gather(gathered_text_features, text_features)
+                if not local_loss:
+                    # ensure grads for local rank when all_* features don't have a gradient
+                    gathered_image_features[rank] = image_features
+                    gathered_text_features[rank] = text_features
+                all_image_features = torch.cat(gathered_image_features, dim=0)
+                all_text_features = torch.cat(gathered_text_features, dim=0)
+        return all_image_features, all_text_features
+    def get_ground_truth(self, device, num_logits) -> torch.Tensor:
+        # calculated ground-truth and cache if enabled
+        if self.prev_num_logits != num_logits or device not in self.labels:
+            labels = torch.arange(num_logits, device=device, dtype=torch.long)
+            if self.world_size > 1 and self.local_loss:
+                labels = labels + num_logits * self.rank
+            if self.cache_labels:
+                self.labels[device] = labels
+                self.prev_num_logits = num_logits
+        else:
+            labels = self.labels[device]
+        return labels
+    def get_logits(self, image_features, text_features, logit_scale):
+        if self.world_size > 1:
+            all_image_features, all_text_features = self.gather_features(
+                image_features, text_features,
+                self.local_loss, self.gather_with_grad, self.rank, self.world_size, self.use_horovod)
+            if self.local_loss:
+                logits_per_image = logit_scale * image_features @ all_text_features.T
+                logits_per_text = logit_scale * text_features @ all_image_features.T
+            else:
+                logits_per_image = logit_scale * all_image_features @ all_text_features.T
+                logits_per_text = logits_per_image.T
+        else:
+            logits_per_image = logit_scale * image_features @ text_features.T
+            logits_per_text = logit_scale * text_features @ image_features.T
+        return logits_per_image, logits_per_text
+    def forward(self, image_features, text_features, logit_scale, output_dict=False):
+        device = image_features.device
+        logits_per_image, logits_per_text = self.get_logits(image_features, text_features, logit_scale)
+        labels = self.get_ground_truth(device, logits_per_image.shape[0])
+        total_loss = (
+            F.cross_entropy(logits_per_image, labels) +
+            F.cross_entropy(logits_per_text, labels)
+        ) / 2
+        return {"contrastive_loss": total_loss} if output_dict else total_loss
+class M3Patchilizer:
+    def __init__(self):
+        self.delimiters = ["|:", "::", ":|", "[|", "||", "|]", "|"]
+        self.regexPattern = '(' + '|'.join(map(re.escape, self.delimiters)) + ')'
+        self.pad_token_id = 0
+        self.bos_token_id = 1
+        self.eos_token_id = 2
+        self.mask_token_id = 3
+    def split_bars(self, body):
+        bars = re.split(self.regexPattern, ''.join(body))
+        bars = list(filter(None, bars))  # remove empty strings
+        if bars[0] in self.delimiters:
+            bars[1] = bars[0] + bars[1]
+            bars = bars[1:]
+        bars = [bars[i * 2] + bars[i * 2 + 1] for i in range(len(bars) // 2)]
+        return bars
+    def bar2patch(self, bar, patch_size=PATCH_SIZE):
+        patch = [self.bos_token_id] + [ord(c) for c in bar] + [self.eos_token_id]
+        patch = patch[:patch_size]
+        patch += [self.pad_token_id] * (patch_size - len(patch))
+        return patch
+    def patch2bar(self, patch):
+        return ''.join(chr(idx) if idx > self.mask_token_id else '' for idx in patch)
+    def encode(self,
+               item,
+               patch_size=PATCH_SIZE,
+               add_special_patches=False,
+               truncate=False,
+               random_truncate=False):
+        item = item.replace("L:1/8\n", "")
+        item = unidecode(item)
+        lines = re.findall(r'.*?\n|.*$', item)
+        lines = list(filter(None, lines))  # remove empty lines
+        patches = []
+        if lines[0].split(" ")[0] == "ticks_per_beat":
+            patch = ""
+            for line in lines:
+                if patch.startswith(line.split(" ")[0]) and (len(patch) + len(" ".join(line.split(" ")[1:])) <= patch_size-2):
+                    patch = patch[:-1] + "\t" + " ".join(line.split(" ")[1:])
+                else:
+                    if patch:
+                        patches.append(patch)
+                    patch = line
+            if patch!="":
+                patches.append(patch)
+        else:
+            for line in lines:
+                if len(line) > 1 and ((line[0].isalpha() and line[1] == ':') or line.startswith('%%')):
+                    patches.append(line)
+                else:
+                    bars = self.split_bars(line)
+                    if bars:
+                        bars[-1] += '\n'
+                        patches.extend(bars)
+        if add_special_patches:
+            bos_patch = chr(self.bos_token_id) * patch_size
+            eos_patch = chr(self.eos_token_id) * patch_size
+            patches = [bos_patch] + patches + [eos_patch]
+        if len(patches) > PATCH_LENGTH and truncate:
+            choices = ["head", "tail", "middle"]
+            choice = random.choice(choices)
+            if choice=="head" or random_truncate==False:
+                patches = patches[:PATCH_LENGTH]
+            elif choice=="tail":
+                patches = patches[-PATCH_LENGTH:]
+            else:
+                start = random.randint(1, len(patches)-PATCH_LENGTH)
+                patches = patches[start:start+PATCH_LENGTH]
+        patches = [self.bar2patch(patch) for patch in patches]
+        return patches
+    def decode(self, patches):
+        return ''.join(self.patch2bar(patch) for patch in patches)
+class M3PatchEncoder(PreTrainedModel):
+    def __init__(self, config):
+        super(M3PatchEncoder, self).__init__(config)
+        self.patch_embedding = torch.nn.Linear(PATCH_SIZE*128, M3_HIDDEN_SIZE)
+        torch.nn.init.normal_(self.patch_embedding.weight, std=0.02)
+        self.base = BertModel(config=config)
+        self.pad_token_id = 0
+        self.bos_token_id = 1
+        self.eos_token_id = 2
+        self.mask_token_id = 3
+    def forward(self,
+                input_patches, # [batch_size, seq_length, hidden_size]
+                input_masks):  # [batch_size, seq_length]
+        # Transform input_patches into embeddings
+        input_patches = torch.nn.functional.one_hot(input_patches, num_classes=128)
+        input_patches = input_patches.reshape(len(input_patches), -1, PATCH_SIZE*128).type(torch.FloatTensor)
+        input_patches = self.patch_embedding(input_patches.to(self.device))
+        # Apply BERT model to input_patches and input_masks
+        return self.base(inputs_embeds=input_patches, attention_mask=input_masks)
+class M3TokenDecoder(PreTrainedModel):
+    def __init__(self, config):
+        super(M3TokenDecoder, self).__init__(config)
+        self.base = GPT2LMHeadModel(config=config)
+        self.pad_token_id = 0
+        self.bos_token_id = 1
+        self.eos_token_id = 2
+        self.mask_token_id = 3
+    def forward(self,
+                patch_features,  # [batch_size, hidden_size]
+                target_patches): # [batch_size, seq_length]
+        # get input embeddings
+        inputs_embeds = torch.nn.functional.embedding(target_patches, self.base.transformer.wte.weight)
+        # concatenate the encoded patches with the input embeddings
+        inputs_embeds = torch.cat((patch_features.unsqueeze(1), inputs_embeds[:,1:,:]), dim=1)
+        # preparing the labels for model training
+        target_masks = target_patches == self.pad_token_id
+        target_patches = target_patches.clone().masked_fill_(target_masks, -100)
+        # get the attention mask
+        target_masks = ~target_masks
+        target_masks = target_masks.type(torch.int)
+        return self.base(inputs_embeds=inputs_embeds,
+                         attention_mask=target_masks,
+                         labels=target_patches)
+    def generate(self,
+                 patch_feature,
+                 tokens):
+        # reshape the patch_feature and tokens
+        patch_feature = patch_feature.reshape(1, 1, -1)
+        tokens = tokens.reshape(1, -1)
+        # get input embeddings
+        tokens = torch.nn.functional.embedding(tokens, self.base.transformer.wte.weight)
+        # concatenate the encoded patches with the input embeddings
+        tokens = torch.cat((patch_feature, tokens[:,1:,:]), dim=1)
+        # get the outputs from the model
+        outputs = self.base(inputs_embeds=tokens)
+        # get the probabilities of the next token
+        probs = torch.nn.functional.softmax(outputs.logits.squeeze(0)[-1], dim=-1)
+        return probs.detach().cpu().numpy()
+class M3Model(PreTrainedModel):
+    def __init__(self, encoder_config, decoder_config):
+        super(M3Model, self).__init__(encoder_config)
+        self.encoder = M3PatchEncoder(encoder_config)
+        self.decoder = M3TokenDecoder(decoder_config)
+        self.pad_token_id = 0
+        self.bos_token_id = 1
+        self.eos_token_id = 2
+        self.mask_token_id = 3
+    def forward(self,
+                input_patches,      # [batch_size, seq_length, hidden_size]
+                input_masks,        # [batch_size, seq_length]
+                selected_indices,   # [batch_size, seq_length]
+                target_patches):    # [batch_size, seq_length, hidden_size]
+        input_patches = input_patches.reshape(len(input_patches), -1, PATCH_SIZE).to(self.device)
+        input_masks = input_masks.to(self.device)
+        selected_indices = selected_indices.to(self.device)
+        target_patches = target_patches.reshape(len(target_patches), -1, PATCH_SIZE).to(self.device)
+        # Pass the input_patches and input_masks through the encoder
+        outputs = self.encoder(input_patches, input_masks)["last_hidden_state"]
+        # Use selected_indices to form target_patches
+        target_patches = target_patches[selected_indices.bool()]
+        patch_features = outputs[selected_indices.bool()]
+        # Pass patch_features and target_patches through the decoder
+        return self.decoder(patch_features, target_patches)
+class CLaMP3Model(PreTrainedModel):
+    def __init__(self,
+                 audio_config,
+                 symbolic_config,
+                 global_rank=None,
+                 world_size=None,
+                 text_model_name=TEXT_MODEL_NAME,
+                 hidden_size=CLAMP3_HIDDEN_SIZE,
+                 load_m3=CLAMP3_LOAD_M3):
+        super(CLaMP3Model, self).__init__(symbolic_config)
+        self.text_model = AutoModel.from_pretrained(text_model_name) # Load the text model
+        self.text_proj = torch.nn.Linear(self.text_model.config.hidden_size, hidden_size) # Linear layer for text projections
+        torch.nn.init.normal_(self.text_proj.weight, std=0.02) # Initialize weights with normal distribution
+        self.symbolic_model = M3PatchEncoder(symbolic_config) # Initialize the symbolic model
+        self.symbolic_proj = torch.nn.Linear(M3_HIDDEN_SIZE, hidden_size) # Linear layer for symbolic projections
+        torch.nn.init.normal_(self.symbolic_proj.weight, std=0.02) # Initialize weights with normal distribution
+        self.audio_model = BertModel(audio_config) # Initialize the audio model
+        self.audio_proj = torch.nn.Linear(audio_config.hidden_size, hidden_size) # Linear layer for audio projections
+        torch.nn.init.normal_(self.audio_proj.weight, std=0.02) # Initialize weights with normal distribution
+        if global_rank==None or world_size==None:
+            global_rank = 0
+            world_size = 1
+        self.loss_fn = ClipLoss(local_loss=False,
+                                gather_with_grad=True,
+                                cache_labels=False,
+                                rank=global_rank,
+                                world_size=world_size,
+                                use_horovod=False)
+        if load_m3 and os.path.exists(M3_WEIGHTS_PATH):
+            checkpoint = torch.load(M3_WEIGHTS_PATH, map_location='cpu', weights_only=True)
+            decoder_config = GPT2Config(vocab_size=128,
+                            n_positions=PATCH_SIZE,
+                            n_embd=M3_HIDDEN_SIZE,
+                            n_layer=TOKEN_NUM_LAYERS,
+                            n_head=M3_HIDDEN_SIZE//64,
+                            n_inner=M3_HIDDEN_SIZE*4)
+            model = M3Model(symbolic_config, decoder_config)
+            model.load_state_dict(checkpoint['model'])
+            self.symbolic_model = model.encoder
+            model = None
+            print(f"Successfully Loaded M3 Checkpoint from Epoch {checkpoint['epoch']} with loss {checkpoint['min_eval_loss']}")
+    def set_trainable(self, freeze_list):
+        if "text_model" in freeze_list:
+            self.text_model.eval()
+            for param in self.text_model.parameters():
+                param.requires_grad = False
+            print("Text Model Frozen")
+        else:
+            self.text_model.train()
+            for param in self.text_model.parameters():
+                param.requires_grad = True
+            print("Text Model Training")
+        if "text_proj" in freeze_list:
+            self.text_proj.eval()
+            for param in self.text_proj.parameters():
+                param.requires_grad = False
+            print("Text Projection Layer Frozen")
+        else:
+            self.text_proj.train()
+            for param in self.text_proj.parameters():
+                param.requires_grad = True
+            print("Text Projection Layer Training")
+        if "symbolic_model" in freeze_list:
+            self.symbolic_model.eval()
+            for param in self.symbolic_model.parameters():
+                param.requires_grad = False
+            print("Symbolic Model Frozen")
+        else:
+            self.symbolic_model.train()
+            for param in self.symbolic_model.parameters():
+                param.requires_grad = True
+            print("Symbolic Model Training")
+        if "symbolic_proj" in freeze_list:
+            self.symbolic_proj.eval()
+            for param in self.symbolic_proj.parameters():
+                param.requires_grad = False
+            print("Symbolic Projection Layer Frozen")
+        else:
+            self.symbolic_proj.train()
+            for param in self.symbolic_proj.parameters():
+                param.requires_grad = True
+            print("Symbolic Projection Layer Training")
+        if "audio_model" in freeze_list:
+            self.audio_model.eval()
+            for param in self.audio_model.parameters():
+                param.requires_grad = False
+            print("Audio Model Frozen")
+        else:
+            self.audio_model.train()
+            for param in self.audio_model.parameters():
+                param.requires_grad = True
+            print("Audio Model Training")
+        if "audio_proj" in freeze_list:
+            self.audio_proj.eval()
+            for param in self.audio_proj.parameters():
+                param.requires_grad = False
+            print("Audio Projection Layer Frozen")
+        else:
+            self.audio_proj.train()
+            for param in self.audio_proj.parameters():
+                param.requires_grad = True
+            print("Audio Projection Layer Training")
+    def avg_pooling(self, input_features, input_masks):
+        input_masks = input_masks.unsqueeze(-1).to(self.device) # add a dimension to match the feature dimension
+        input_features = input_features * input_masks # apply mask to input_features
+        avg_pool = input_features.sum(dim=1) / input_masks.sum(dim=1) # calculate average pooling
+        return avg_pool
+    def get_text_features(self,
+                          text_inputs,
+                          text_masks,
+                          get_global=False):
+        text_features = self.text_model(text_inputs.to(self.device),
+                                        attention_mask=text_masks.to(self.device))['last_hidden_state']
+        if get_global:
+            text_features = self.avg_pooling(text_features, text_masks)
+            text_features = self.text_proj(text_features)
+        return text_features
+    def get_symbolic_features(self,
+                              symbolic_inputs,
+                              symbolic_masks,
+                              get_global=False):
+        symbolic_features = self.symbolic_model(symbolic_inputs.to(self.device),
+                                                symbolic_masks.to(self.device))['last_hidden_state']
+        if get_global:
+            symbolic_features = self.avg_pooling(symbolic_features, symbolic_masks)
+            symbolic_features = self.symbolic_proj(symbolic_features)
+        return symbolic_features
+    def get_audio_features(self,
+                           audio_inputs,
+                           audio_masks,
+                           get_global=False):
+        audio_features = self.audio_model(inputs_embeds=audio_inputs.to(self.device),
+                                          attention_mask=audio_masks.to(self.device))['last_hidden_state']
+        if get_global:
+            audio_features = self.avg_pooling(audio_features, audio_masks)
+            audio_features = self.audio_proj(audio_features)
+        return audio_features
+    def forward(self,
+                text_inputs,     # [batch_size, seq_length]
+                text_masks,      # [batch_size, seq_length]
+                music_inputs,    # [batch_size, seq_length, hidden_size]
+                music_masks,     # [batch_size, seq_length]
+                music_modality): # "symbolic" or "audio"
+        # Compute the text features
+        text_features = self.get_text_features(text_inputs, text_masks, get_global=True)
+        # Compute the music features
+        if music_modality=="symbolic":
+            music_features = self.get_symbolic_features(music_inputs, music_masks, get_global=True)
+        elif music_modality=="audio":
+            music_features = self.get_audio_features(music_inputs, music_masks, get_global=True)
+        else:
+            raise ValueError("music_modality must be either 'symbolic' or 'audio'")
+        return self.loss_fn(text_features,
+                            music_features,
+                            LOGIT_SCALE,
+                            output_dict=False)
+def split_data(data, eval_ratio=EVAL_SPLIT):
+    random.shuffle(data)
+    split_idx = int(len(data)*eval_ratio)
+    eval_set = data[:split_idx]
+    train_set = data[split_idx:]
+    return train_set, eval_set
+def mask_patches(target_patches, patchilizer, mode):
+    indices = list(range(len(target_patches)))
+    random.shuffle(indices)
+    selected_indices = indices[:math.ceil(M3_MASK_RATIO*len(indices))]
+    sorted_indices = sorted(selected_indices)
+    input_patches = torch.tensor(target_patches)
+    if mode=="eval":
+        choice = "original"
+    else:
+        choice = random.choices(["mask", "shuffle", "original"], weights=[0.8, 0.1, 0.1])[0]
+    if choice=="mask":
+        input_patches[sorted_indices] = torch.tensor([patchilizer.mask_token_id]*PATCH_SIZE)
+    elif choice=="shuffle":
+        for idx in sorted_indices:
+            patch = input_patches[idx]
+            try:
+                index_eos = (patch == patchilizer.eos_token_id).nonzero().item()
+            except:
+                index_eos = len(patch)
+            indices = list(range(1, index_eos))
+            random.shuffle(indices)
+            indices = [0] + indices + list(range(index_eos, len(patch)))
+            input_patches[idx] = patch[indices]
+    selected_indices = torch.zeros(len(target_patches))
+    selected_indices[sorted_indices] = 1.
+    return input_patches, selected_indices
+def remove_instrument_info(item):
+    # remove instrument information from symbolic music
+    lines = re.findall(r'.*?\n|.*$', item)
+    lines = list(filter(None, lines))
+    if lines[0].split(" ")[0] == "ticks_per_beat":
+        type = "mtf"
+    else:
+        type = "abc"
+    cleaned_lines = []
+    for line in lines:
+        if type=="abc" and line.startswith("V:"):
+            # find the position of " nm=" or " snm="
+            nm_pos = line.find(" nm=")
+            snm_pos = line.find(" snm=")
+            # keep the part before " nm=" or " snm="
+            if nm_pos != -1:
+                line = line[:nm_pos]
+            elif snm_pos != -1:
+                line = line[:snm_pos]
+            if nm_pos != -1 or snm_pos != -1:
+                line += "\n"
+        elif type=="mtf" and line.startswith("program_change"):
+            line = " ".join(line.split(" ")[:-1]) + " 0\n"
+        cleaned_lines.append(line)
+    return ''.join(cleaned_lines)

wikimt-x-public.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff