Spaces:

marcsixtysix
/

rag_chat_

Sleeping

rag_chat_ / generate_rag_data.py

mryt66

Initial commit

a840639 3 months ago

6.66 kB

	from google import genai
	from typing import List
	from pathlib import Path
	import fitz
	import json
	import os
	import textwrap
	from settings import Chunk, Settings


	# Every location below is anchored to the directory that holds this script,
	# so the results do not depend on the current working directory.
	SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
	DATA_DIR = os.path.join(SCRIPT_DIR, "data")
	# Input: put your raw source files (txt/markdown) inside ./data/source
	SOURCE_DIR = os.path.join(DATA_DIR, "source")

	# Create the data directories on import so later reads/writes cannot fail
	# because a folder is missing.
	os.makedirs(DATA_DIR, exist_ok=True)
	os.makedirs(SOURCE_DIR, exist_ok=True)

	# Output artifact locations (align with api.py expectations)
	OUTPUT_CHUNKS_FILE = os.path.join(SCRIPT_DIR, "output_chunks.jsonl")  # already used in api.py
	RAG_CONFIG_FILE = os.path.join(SCRIPT_DIR, "rag_prompt_config.jsonl")  # already used in api.py
	# If you also want these in data/ instead, uncomment:
	# OUTPUT_CHUNKS_FILE = os.path.join(DATA_DIR, "output_chunks.jsonl")
	# RAG_CONFIG_FILE = os.path.join(DATA_DIR, "rag_prompt_config.jsonl")

	# Example system / base prompts (edit as needed).
	# Both are plain {"role", "content"} dicts; write_config() serialises them
	# into the prompt config file.
	SYSTEM_PROMPT = dict(
	    role="system",
	    content="You are a helpful RAG assistant. Use only the provided context. If unsure, say you don't know.",
	)
	BASE_CHUNK = dict(
	    role="base",
	    content="Answer the user's query using only the contextual chunks below.",
	)


	def extract_pdf_text(filename: str) -> str:
	    """Return the text of every page of a PDF, concatenated in page order.

	    Args:
	        filename: Path of the PDF document to open with PyMuPDF (``fitz``).

	    Returns:
	        The ``page.get_text()`` output of each page joined with no separator
	        (an empty string when the document has no pages).
	    """
	    with fitz.open(filename) as doc:
	        # join() assembles the result in a single pass instead of re-copying
	        # the accumulated string on every ``+=``.
	        return "".join(page.get_text() for page in doc)


	def chunk_pdf(filename: str) -> List[Chunk]:
	    """Ask Gemini to split the text of one PDF into RAG-sized chunks.

	    The PDF text is extracted with ``extract_pdf_text`` and embedded in a
	    prompt that requests a JSON array of chunk objects; the response is
	    constrained with ``Settings.response_schema``.

	    Args:
	        filename: Path of the PDF file (callers in this file also pass a
	            ``pathlib.Path``).

	    Returns:
	        The parsed chunks, or an empty list when the response carried no
	        parsed payload.
	    """
	    text = extract_pdf_text(filename)
	    pdf_name = Path(filename).name

	    # NOTE(review): the template asks for "type": "prg" even though the input
	    # is a PDF -- confirm this is what the consumer of the chunks expects.
	    prompt = f"""
	Split the following text into coherent chunks suitable for RAG.
	Each chunk should be 100-500 words.
	Do not cut mid-sentence, paragraph, or table.
	Preserve headings, bullet points, and tables.

	Return an array of JSON objects with this structure:
	{{
	"content": "<chunk text>",
	"source": "{pdf_name}",
	"tags": [],
	"type": "prg"
	}}
	Text:
	{text}
	"""

	    # A single client is enough; the original constructed it twice.
	    client = genai.Client()
	    response = client.models.generate_content(
	        model="gemini-2.5-flash",
	        contents=prompt,
	        config={
	            "response_mime_type": "application/json",
	            "response_schema": Settings.response_schema,
	        },
	    )

	    # ``parsed`` is optional in the SDK; fall back to an empty list so callers
	    # that extend() with the result do not fail on None.
	    chunks: List[Chunk] = response.parsed or []
	    return chunks


	def process_pdf_folder(folder_path):
	    """Chunk every ``*.pdf`` found directly inside *folder_path*.

	    Files are handled in file-name order and a progress line is printed for
	    each one. Returns the concatenated chunks of all PDFs, or an empty list
	    (after printing a notice) when the folder contains no PDF.
	    """
	    pdf_paths = sorted(Path(folder_path).glob("*.pdf"), key=lambda p: p.name)
	    if not pdf_paths:
	        print(f"No PDF files found in {folder_path}")
	        return []

	    collected = []
	    for pdf_path in pdf_paths:
	        print(f"Processing PDF: {pdf_path.name}")
	        collected.extend(chunk_pdf(filename=pdf_path))
	    return collected


	def make_prg_chunk(text, filename):
	    """Wrap the contents of one .prg file as a single-element chunk list.

	    The chunk holds the whitespace-stripped *text*, the base name of
	    *filename* as its source, an empty tag list and the type ``"prg"``.
	    """
	    body = text.strip()
	    source_name = Path(filename).name
	    chunk = {"content": body, "source": source_name, "tags": [], "type": "prg"}
	    return [chunk]


	def process_prg_folder(folder_path):
	    """Turn every ``*.prg`` file directly inside *folder_path* into a chunk.

	    Files are read as UTF-8 (undecodable bytes are dropped) in file-name
	    order, with a progress line printed per file. Returns the collected
	    chunks, or an empty list (after printing a notice) when none exist.
	    """
	    prg_paths = sorted(Path(folder_path).glob("*.prg"), key=lambda p: p.name)
	    if not prg_paths:
	        print(f"No .prg files found in {folder_path}")
	        return []

	    collected = []
	    for prg_path in prg_paths:
	        print(f"Processing PRG: {prg_path.name}")
	        program_text = prg_path.read_text(encoding="utf-8", errors="ignore")
	        collected.extend(make_prg_chunk(program_text, prg_path.name))
	    return collected


	def read_source_files():
	    """Load all .txt / .md files from SOURCE_DIR.

	    Entries are visited in sorted name order so the result (and the chunk
	    ids derived from it) is the same on every run; ``os.listdir`` alone
	    returns names in arbitrary order. Directory entries whose name happens
	    to end in ``.txt``/``.md`` are skipped instead of crashing ``open``.

	    Returns:
	        A list of ``(file_name, file_text)`` tuples. When no matching file
	        exists, a ``demo.txt`` placeholder is written to SOURCE_DIR and
	        returned as the only entry.
	    """
	    files = []
	    for name in sorted(os.listdir(SOURCE_DIR)):
	        if not name.lower().endswith((".txt", ".md")):
	            continue
	        path = os.path.join(SOURCE_DIR, name)
	        if not os.path.isfile(path):
	            continue
	        with open(path, "r", encoding="utf-8") as f:
	            files.append((name, f.read()))
	    if files:
	        return files

	    # Provide a fallback demo file if none exist
	    demo_path = os.path.join(SOURCE_DIR, "demo.txt")
	    demo_text = (
	        "This is a demo knowledge file.\n"
	        "Add your project or domain documentation as .txt or .md files here."
	    )
	    with open(demo_path, "w", encoding="utf-8") as f:
	        f.write(demo_text)
	    return [("demo.txt", demo_text)]


	def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150):
	    """Simple character-based chunking with overlap.

	    Args:
	        text: Text to split; surrounding whitespace is stripped first.
	        max_chars: Maximum length of each window, must be positive.
	        overlap: Number of characters shared by consecutive windows; must
	            satisfy ``0 <= overlap < max_chars`` so each window starts
	            after the previous one.

	    Returns:
	        The stripped, non-empty chunks in order (``[]`` for blank text).

	    Raises:
	        ValueError: If ``max_chars`` or ``overlap`` is out of range. The
	            previous implementation looped forever on such values.
	    """
	    if max_chars <= 0:
	        raise ValueError("max_chars must be a positive integer")
	    if not 0 <= overlap < max_chars:
	        raise ValueError("overlap must satisfy 0 <= overlap < max_chars")

	    text = text.strip()
	    if not text:
	        return []

	    step = max_chars - overlap  # distance between consecutive window starts
	    chunks = []
	    for start in range(0, len(text), step):
	        piece = text[start:start + max_chars].strip()
	        if piece:  # a window made only of whitespace carries no content
	            chunks.append(piece)
	        if start + max_chars >= len(text):
	            # This window already reached the end of the text.
	            break
	    return chunks


	def build_chunks():
	    """Create chunk objects suitable for embedding.

	    Every source file is split with ``chunk_text``; the resulting parts are
	    numbered consecutively from 0 across all files and returned as dicts
	    with the keys ``id``, ``source`` and ``content``.
	    """
	    # Flatten (file, part) pairs first so one enumerate() numbers them all.
	    parts_by_file = (
	        (filename, part)
	        for filename, content in read_source_files()
	        for part in chunk_text(content)
	    )
	    return [
	        {"id": number, "source": filename, "content": part}
	        for number, (filename, part) in enumerate(parts_by_file)
	    ]


	def write_jsonl(path: str, records):
	    """Write *records* to *path* as JSON Lines (one JSON document per line).

	    The file is overwritten, encoded as UTF-8, and non-ASCII characters are
	    written as-is rather than escaped.
	    """
	    with open(path, "w", encoding="utf-8") as out:
	        out.writelines(
	            json.dumps(record, ensure_ascii=False) + "\n" for record in records
	        )


	def write_config(path: str):
	    """Write system + base prompt config file (list with single object).

	    The file is overwritten with an indented, UTF-8 encoded JSON array whose
	    only element has the keys ``system_prompt`` and ``base_chunk``.
	    """
	    config_entry = {"system_prompt": SYSTEM_PROMPT, "base_chunk": BASE_CHUNK}
	    serialized = json.dumps([config_entry], ensure_ascii=False, indent=2)
	    with open(path, "w", encoding="utf-8") as out:
	        out.write(serialized)


	def main():
	    """Run both generation passes: PDF/PRG chunking, then text-file chunking.

	    Usage: ``python generate_rag_data.py [pdf_folder [prg_folder]]``.
	    When an argument is omitted the folder defaults to ``data/pdfs`` or
	    ``data/prg`` next to this script; the original hard-coded absolute
	    paths that exist on only one developer machine.
	    """
	    import sys  # local import: only this entry point needs the CLI arguments

	    pdf_folder = sys.argv[1] if len(sys.argv) > 1 else os.path.join(DATA_DIR, "pdfs")
	    prg_folder = sys.argv[2] if len(sys.argv) > 2 else os.path.join(DATA_DIR, "prg")
	    # NOTE(review): this path is relative to the current working directory.
	    # When the script is started from SCRIPT_DIR it names the same file as
	    # OUTPUT_CHUNKS_FILE, so the write_jsonl() call below overwrites the
	    # PDF/PRG chunks written here -- confirm which content api.py expects.
	    output_jsonl = "output_chunks.jsonl"

	    all_chunks = process_pdf_folder(pdf_folder)

	    if prg_folder:
	        all_chunks += process_prg_folder(prg_folder)

	    # Written as one indented JSON array (not one object per line).
	    with open(output_jsonl, "w", encoding="utf-8") as f:
	        json.dump(all_chunks, f, ensure_ascii=False, indent=2)

	    print(f"Finished. {len(all_chunks)} total chunks written to {output_jsonl}")

	    # Second pass: chunk the plain-text sources and write the prompt config.
	    print(f"Generating RAG data from: {SOURCE_DIR}")
	    chunks = build_chunks()
	    print(f"Built {len(chunks)} chunks")
	    write_jsonl(OUTPUT_CHUNKS_FILE, chunks)
	    write_config(RAG_CONFIG_FILE)
	    print(f"Wrote chunks to: {OUTPUT_CHUNKS_FILE}")
	    print(f"Wrote config to: {RAG_CONFIG_FILE}")
	    print("Done.")


	# Run the generation only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()