|
|
import json, jsonlines, pathlib
|
|
|
import concurrent.futures
|
|
|
from tqdm import tqdm
|
|
|
from datasets import load_dataset
|
|
|
from math import ceil
|
|
|
from pubmed import download_pubmed
|
|
|
|
|
|
# Directory that receives every generated corpus file. It is created at import
# time (missing parents included) so the builders can write into it directly.
OUT = pathlib.Path("data/corpora")
OUT.mkdir(parents=True, exist_ok=True)

# Divisor used by build_pubmed to convert a record budget into the number of
# PubMed XML files to request.
PUBMED_ARTICLES_PER_XML_FILE = 30_000
|
|
|
|
|
|
def write_jsonl(path, rows):
    """Write *rows* to *path* as a JSON Lines file, logging start and finish.

    ``rows`` must support ``len()`` because its size is printed before the
    write begins. The file is opened in ``"w"`` mode and all rows are handed
    to the ``jsonlines`` writer in a single ``write_all`` call.
    """
    print(f"Writing {len(rows)} records to {path}")
    with jsonlines.open(path, mode="w") as writer:
        writer.write_all(rows)
    print(f"Finished writing {path}")
|
|
|
|
|
|
|
|
|
def build_lasseregin():
    """Download the LasseRegin iCliniq Q&A dump and write it as JSONL.

    Fetches ``icliniqQAs.json`` from GitHub, maps every record to a row with
    the keys ``id``, ``title``, ``question``, ``answer`` and ``source``, and
    writes the rows to ``OUT / "medical_qa.jsonl"``.

    The build is best-effort: if the download, decoding or JSON parsing
    fails, the error is printed and the function returns without writing
    anything.

    Returns:
        None.
    """
    import urllib.request

    print("Starting LasseRegin build...")
    url = "https://raw.githubusercontent.com/LasseRegin/medical-question-answer-data/master/icliniqQAs.json"

    try:
        # Without a timeout a stalled connection would block this worker
        # forever. The timeout bounds each blocking socket operation; a
        # timeout error is reported through the handler below like any
        # other download failure.
        with urllib.request.urlopen(url, timeout=60) as response:
            data = json.loads(response.read().decode("utf-8"))
    except Exception as e:
        print(f"Failed to download LasseRegin data: {e}")
        return

    # Missing fields default to "" so every row has the same shape.
    rows = [
        {
            "id": f"icliniq:{i}",
            "title": r.get("title", ""),
            "question": r.get("question", ""),
            "answer": r.get("answer", ""),
            "source": "icliniq",
        }
        for i, r in enumerate(tqdm(data, desc="LasseRegin", leave=False))
    ]
    write_jsonl(OUT / "medical_qa.jsonl", rows)
    print("Completed LasseRegin build.")
|
|
|
|
|
|
|
|
|
def build_miriad(sample_size=200_000):
    """Sample the MIRIAD dataset and write the sample as JSONL.

    Loads the ``train`` split of ``miriad/miriad-4.4M``, shuffles it with a
    fixed seed (42) and keeps at most ``sample_size`` examples. Each kept
    example becomes a row with the keys ``id``, ``title``, ``question``,
    ``answer``, ``year`` and ``specialty``; the rows are written to
    ``OUT / "miriad_text.jsonl"``.

    If loading or sampling raises, the error is printed and the function
    returns without writing anything.
    """
    print(f"Starting MIRIAD build (sample_size={sample_size})...")

    try:
        full = load_dataset("miriad/miriad-4.4M", num_proc=4, split="train")
        # Never ask for more examples than the split contains.
        keep = min(sample_size, len(full))
        ds = full.shuffle(seed=42).select(range(keep))
    except Exception as e:
        print(f"Failed to load MIRIAD dataset: {e}")
        return

    # Missing fields default to "" so every row has the same shape.
    rows = [
        {
            "id": f"miriad:{idx}",
            "title": example.get("paper_title", ""),
            "question": example.get("question", ""),
            "answer": example.get("passage_text", ""),
            "year": example.get("year", ""),
            "specialty": example.get("specialty", ""),
        }
        for idx, example in enumerate(tqdm(ds, desc="miriad", leave=False))
    ]
    write_jsonl(OUT / "miriad_text.jsonl", rows)
    print("Completed MIRIAD build.")
|
|
|
|
|
|
|
|
|
def build_pubmed(max_records=500_000):
    """Build the PubMed corpus by delegating to ``download_pubmed``.

    The record budget is converted into a file count by dividing by
    ``PUBMED_ARTICLES_PER_XML_FILE`` and rounding up; that count and the
    target path ``OUT / "pubmed.jsonl"`` are passed to ``download_pubmed``.

    NOTE(review): because of the rounding up, the request may cover more than
    ``max_records`` articles; whether ``download_pubmed`` trims the result is
    not visible from this file -- confirm.
    """
    files_needed = ceil(max_records / PUBMED_ARTICLES_PER_XML_FILE)
    num_files = int(files_needed)
    print(f"Starting PubMed build (num_files={num_files}, max_records={max_records})...")

    target = OUT / "pubmed.jsonl"
    download_pubmed(target, num_files)
    print("Completed PubMed build.")
|
|
|
|
|
|
|
|
|
def build_unidoc(max_items=1000):
    """Export up to ``max_items`` UniDoc-Bench healthcare examples as JSONL.

    Loads the ``healthcare`` split of ``Salesforce/UniDoc-Bench`` and turns
    each example into a row with the keys ``id``, ``title``, ``question``,
    ``answer`` and ``pdf_path``, written to ``OUT / "unidoc_qa.jsonl"``.

    Field fallbacks: the question comes from ``question`` or, failing that,
    ``query``; the PDF path comes from ``pdf_path`` or ``document_path``.
    Missing or ``None`` values are normalised to ``""`` so the output never
    contains nulls for these fields.

    Args:
        max_items: Maximum number of rows to write. A value of 0 or less
            produces an empty file.

    If the dataset cannot be loaded, the error is printed and the function
    returns without writing anything.
    """
    print(f"Starting UniDoc build (max_items={max_items})...")

    try:
        ds = load_dataset("Salesforce/UniDoc-Bench", split="healthcare")
    except Exception as e:
        print(f"Failed to load UniDoc dataset: {e}")
        return

    rows = []
    for i, ex in enumerate(tqdm(ds, desc="unidoc", leave=False)):
        # Check the limit before appending so that max_items <= 0 yields no
        # rows (checking afterwards always let one row through).
        if i >= max_items:
            break

        # The trailing `or ""` turns an explicit None into "" for every
        # field, matching how the answer and path were already handled.
        q = ex.get("question") or ex.get("query") or ""
        a = ex.get("answer") or ""
        pdf = ex.get("pdf_path") or ex.get("document_path") or ""
        domain = ex.get("domain") or ""

        rows.append({
            "id": f"unidoc:{i}",
            "title": f"{domain} PDF",
            "question": q,
            "answer": a,
            "pdf_path": pdf,
        })

    write_jsonl(OUT / "unidoc_qa.jsonl", rows)
    print("Completed UniDoc build.")
|
|
|
|
|
|
def main():
    """Run all corpus builders concurrently and report the overall outcome.

    Each builder is submitted to a four-worker thread pool with its
    arguments. A builder that raises is reported by name as soon as its
    future completes. The final success message is printed only when no
    builder raised; otherwise a summary naming the failed builders is
    printed instead.

    Note that builders which catch their own loading errors print their own
    failure message and return normally, so they are not counted as failed
    here.
    """
    print("Starting parallel corpora build...")

    tasks = [
        (build_lasseregin, []),
        (build_miriad, [1000]),
        (build_pubmed, [500_000]),
        (build_unidoc, [1000]),
    ]

    failed = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        # Map each future back to its builder's name so a failure can say
        # which task it belongs to.
        futures = {
            executor.submit(func, *args): func.__name__
            for func, args in tasks
        }
        for future in concurrent.futures.as_completed(futures):
            try:
                future.result()
            except Exception as e:
                name = futures[future]
                failed.append(name)
                print(f"A task failed ({name}): {e}")

    # Only claim success when nothing raised.
    if failed:
        print(f"⚠️ {len(failed)} of {len(tasks)} build tasks failed: {', '.join(failed)}")
    else:
        print("✅ All corpora built successfully in data/corpora/")
|
|
|
|
|
|
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|
|