Spaces:

marcsixtysix
/

rag_chat_

Sleeping

File size: 6,433 Bytes

7ba3a81
a840639

import google.generativeai as genai
from typing import List
from pathlib import Path
import fitz
import json
import os
import textwrap
from settings import Chunk, Settings


# Directory containing this script; every path below is anchored here so the
# artifacts do not depend on the current working directory.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# Input locations.
# NOTE(review): SOURCE_DIR (used by read_source_files() and main()) and
# DATA_DIR (used by the commented alternative below) were referenced but never
# defined, which raised NameError at runtime. The locations chosen here are an
# assumption -- confirm them against the real project layout.
DATA_DIR = os.path.join(SCRIPT_DIR, "data")
SOURCE_DIR = os.path.join(DATA_DIR, "source")

# Output artifact locations (align with api.py expectations)
OUTPUT_CHUNKS_FILE = os.path.join(
    SCRIPT_DIR, "output_chunks.jsonl"
)  # already used in api.py
RAG_CONFIG_FILE = os.path.join(
    SCRIPT_DIR, "rag_prompt_config.jsonl"
)  # already used in api.py
# If you also want these in data/ instead, uncomment:
# OUTPUT_CHUNKS_FILE = os.path.join(DATA_DIR, "output_chunks.jsonl")
# RAG_CONFIG_FILE = os.path.join(DATA_DIR, "rag_prompt_config.jsonl")

# Example prompt records that write_config() stores in the RAG config file
# (edit the wording as needed).
SYSTEM_PROMPT = dict(
    role="system",
    content="You are a helpful RAG assistant. Use only the provided context. If unsure, say you don't know.",
)
BASE_CHUNK = dict(
    role="base",
    content="Answer the user's query using only the contextual chunks below.",
)


def extract_pdf_text(filename: str) -> str:
    """Return the text of every page of the PDF at *filename*, concatenated.

    The document is opened through ``fitz`` and closed again when the ``with``
    block exits. Page texts are joined in page order with no separator.
    """
    with fitz.open(filename) as pdf:
        return "".join(page.get_text() for page in pdf)


def chunk_pdf(filename: str) -> List[Chunk]:
    """Split one PDF into RAG chunks by asking a Gemini model to segment it.

    The PDF's text is extracted with ``extract_pdf_text``, embedded in a
    prompt that describes the desired chunk structure, and sent to the
    ``gemini-2.5-flash`` model with a JSON response schema taken from
    ``Settings.response_schema``.

    Args:
        filename: Path of the PDF to process. Only its base name is put into
            the prompt as the ``source`` value.

    Returns:
        The parsed chunks from the model response, or an empty list when the
        response could not be parsed into the schema.
    """
    text = extract_pdf_text(filename)
    pdf_name = Path(filename).name

    prompt = f"""
    Split the following text into coherent chunks suitable for RAG.
    Each chunk should be 100-500 words.
    Do not cut mid-sentence, paragraph, or table.
    Preserve headings, bullet points, and tables.
    
    Return an array of JSON objects with this structure:
    {{
        "content": "<chunk text>",
        "source": "{pdf_name}",
        "tags": [],
        "type": "prg"
    }}
    Text:
    {text}
    """

    # NOTE(review): Client / client.models.generate_content / response.parsed
    # look like the google-genai SDK API (`from google import genai`), while
    # this file imports `google.generativeai` -- confirm the import matches
    # the installed SDK.
    client = genai.Client()  # created once; the original built it twice
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config={
            "response_mime_type": "application/json",
            "response_schema": Settings.response_schema,
        },
    )

    chunks = response.parsed
    if chunks is None:
        # Returning None here would make the caller's list.extend() raise
        # TypeError, so report the problem and return no chunks instead.
        print(f"No parsable chunks returned for {pdf_name}")
        return []
    return chunks


def process_pdf_folder(folder_path):
    """Chunk every ``*.pdf`` file directly inside *folder_path*.

    Files are handled in file-name order and each one is passed to
    ``chunk_pdf``; the resulting chunks are gathered into a single list.
    Prints a notice and returns an empty list when no PDF matches.
    """
    pdf_paths = sorted(Path(folder_path).glob("*.pdf"), key=lambda p: p.name)
    if not pdf_paths:
        print(f"No PDF files found in {folder_path}")
        return []

    collected = []
    for pdf_path in pdf_paths:
        print(f"Processing PDF: {pdf_path.name}")
        collected.extend(chunk_pdf(filename=pdf_path))
    return collected


def make_prg_chunk(text, filename):
    """Wrap the whole of *text* as a single ``prg`` chunk record.

    Returns a one-element list so callers can ``extend`` their results with
    it. The record holds the stripped text, the base name of *filename*, an
    empty tag list and the type ``"prg"``.
    """
    record = dict(
        content=text.strip(),
        source=Path(filename).name,
        tags=[],
        type="prg",
    )
    return [record]


def process_prg_folder(folder_path):
    """Turn every ``*.prg`` file directly inside *folder_path* into a chunk.

    Files are read in file-name order as UTF-8 (undecodable bytes are
    dropped) and each file becomes one record via ``make_prg_chunk``.
    Prints a notice and returns an empty list when no ``.prg`` file matches.
    """
    prg_paths = sorted(Path(folder_path).glob("*.prg"), key=lambda p: p.name)
    if not prg_paths:
        print(f"No .prg files found in {folder_path}")
        return []

    collected = []
    for prg_path in prg_paths:
        print(f"Processing PRG: {prg_path.name}")
        body = prg_path.read_text(encoding="utf-8", errors="ignore")
        collected += make_prg_chunk(body, prg_path.name)
    return collected


def read_source_files():
    """Load all .txt / .md files from SOURCE_DIR.

    Returns:
        A list of ``(file_name, file_text)`` tuples in file-name order. When
        the directory holds no matching file, a ``demo.txt`` placeholder is
        written there and returned as the only entry.
    """
    # Create the directory first: os.listdir() raises FileNotFoundError on a
    # missing directory, which would prevent the demo fallback from ever
    # bootstrapping a fresh checkout.
    os.makedirs(SOURCE_DIR, exist_ok=True)

    files = []
    # os.listdir() order is arbitrary; sort so chunk ids derived from this
    # list are reproducible, as the PDF/PRG loaders already do.
    for name in sorted(os.listdir(SOURCE_DIR)):
        if name.lower().endswith((".txt", ".md")):
            path = os.path.join(SOURCE_DIR, name)
            with open(path, "r", encoding="utf-8") as f:
                files.append((name, f.read()))
    if files:
        return files

    # Provide a fallback demo file if none exist
    demo_path = os.path.join(SOURCE_DIR, "demo.txt")
    demo_text = (
        "This is a demo knowledge file.\n"
        "Add your project or domain documentation as .txt or .md files here."
    )
    with open(demo_path, "w", encoding="utf-8") as f:
        f.write(demo_text)
    return [("demo.txt", demo_text)]


def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150):
    """Simple character-based chunking with overlap.

    Args:
        text: Text to split; surrounding whitespace is stripped first.
        max_chars: Maximum window length in characters; must be positive.
        overlap: Number of characters shared by consecutive windows; must
            satisfy ``0 <= overlap < max_chars``.

    Returns:
        The list of stripped chunks in order, or ``[]`` for blank input.

    Raises:
        ValueError: If ``max_chars`` or ``overlap`` is out of range. Without
            this check, ``overlap >= max_chars`` made the window stop
            advancing and the loop never terminated.
    """
    if max_chars <= 0:
        raise ValueError(f"max_chars must be positive, got {max_chars}")
    if not 0 <= overlap < max_chars:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < max_chars, got {overlap}"
        )

    text = text.strip()
    if not text:
        return []

    # Each window starts `step` characters after the previous one, so
    # consecutive windows share exactly `overlap` characters.
    step = max_chars - overlap
    chunks = []
    for start in range(0, len(text), step):
        end = start + max_chars
        chunks.append(text[start:end].strip())
        if end >= len(text):
            # This window reached the end of the text; a further window would
            # only repeat its tail.
            break
    return chunks


def build_chunks():
    """Create chunk objects suitable for embedding.

    Every file returned by ``read_source_files`` is split with
    ``chunk_text``; each piece becomes a dict with a running ``id`` (starting
    at 0 and increasing across all files), its ``source`` file name and its
    ``content``.
    """
    pieces = (
        (source_name, piece)
        for source_name, body in read_source_files()
        for piece in chunk_text(body)
    )
    return [
        {"id": number, "source": source_name, "content": piece}
        for number, (source_name, piece) in enumerate(pieces)
    ]


def write_jsonl(path: str, records):
    """Write *records* to *path* as JSON Lines, one JSON document per line.

    The file is created or truncated and encoded as UTF-8; non-ASCII
    characters are written as-is, not escaped.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )


def write_config(path: str):
    """Write the prompt configuration to *path* as indented UTF-8 JSON.

    The document is a list holding a single object with the keys
    ``system_prompt`` and ``base_chunk``.
    """
    config = [{"system_prompt": SYSTEM_PROMPT, "base_chunk": BASE_CHUNK}]
    serialized = json.dumps(config, ensure_ascii=False, indent=2)
    with open(path, "w", encoding="utf-8") as out:
        out.write(serialized)


def main():
    """Run the full generation pipeline.

    1. Chunk every PDF and .prg file and dump the result as a JSON array to
       ``output_chunks.jsonl`` in the current working directory.
    2. Build character-based chunks from the text files in SOURCE_DIR and
       write them, plus the prompt config, to OUTPUT_CHUNKS_FILE and
       RAG_CONFIG_FILE.

    Command line:
        argv[1]: optional PDF folder (default: ``<script dir>/data/pdfs``).
        argv[2]: optional .prg folder (default: ``<script dir>/data/prg``).
    """
    import sys  # local import: only the script entry point reads argv

    # The folders used to be absolute paths on one developer's machine; take
    # them from the command line and fall back to script-relative defaults.
    data_root = Path(SCRIPT_DIR) / "data"
    pdf_folder = Path(sys.argv[1]) if len(sys.argv) > 1 else data_root / "pdfs"
    prg_folder = Path(sys.argv[2]) if len(sys.argv) > 2 else data_root / "prg"
    output_jsonl = "output_chunks.jsonl"

    all_chunks = process_pdf_folder(pdf_folder)
    all_chunks += process_prg_folder(prg_folder)

    # NOTE(review): this writes a single indented JSON array (not JSON Lines)
    # relative to the working directory. When the script is run from its own
    # directory this is the same path as OUTPUT_CHUNKS_FILE, so write_jsonl()
    # below overwrites it -- confirm which output api.py is meant to read.
    with open(output_jsonl, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)

    print(f"Finished. {len(all_chunks)} total chunks written to {output_jsonl}")

    print(f"Generating RAG data from: {SOURCE_DIR}")
    chunks = build_chunks()
    print(f"Built {len(chunks)} chunks")
    write_jsonl(OUTPUT_CHUNKS_FILE, chunks)
    write_config(RAG_CONFIG_FILE)
    print(f"Wrote chunks to: {OUTPUT_CHUNKS_FILE}")
    print(f"Wrote config to: {RAG_CONFIG_FILE}")
    print("Done.")


if __name__ == "__main__":
    main()