# NOTE: header left over from a scraped file view (Spaces status: Sleeping).
# File size: 6,433 Bytes; refs shown: 7ba3a81, a840639.
import google.generativeai as genai
from typing import List
from pathlib import Path
import fitz
import json
import os
import textwrap
from settings import Chunk, Settings
# Directory containing this script; generated artifacts are written next to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# NOTE(review): SOURCE_DIR is used by read_source_files() and main(), and
# DATA_DIR by the commented-out lines below, but neither was defined anywhere
# in this file, so running the script raised NameError. The locations chosen
# here are an assumption -- confirm them against the project layout.
DATA_DIR = os.path.join(SCRIPT_DIR, "data")
SOURCE_DIR = os.path.join(DATA_DIR, "source")

# Output artifact locations (align with api.py expectations)
OUTPUT_CHUNKS_FILE = os.path.join(
    SCRIPT_DIR, "output_chunks.jsonl"
)  # already used in api.py
RAG_CONFIG_FILE = os.path.join(
    SCRIPT_DIR, "rag_prompt_config.jsonl"
)  # already used in api.py

# If you also want these in data/ instead, uncomment:
# OUTPUT_CHUNKS_FILE = os.path.join(DATA_DIR, "output_chunks.jsonl")
# RAG_CONFIG_FILE = os.path.join(DATA_DIR, "rag_prompt_config.jsonl")

# Example system / base prompts (edit as needed)
SYSTEM_PROMPT = {
    "role": "system",
    "content": "You are a helpful RAG assistant. Use only the provided context. If unsure, say you don't know.",
}
BASE_CHUNK = {
    "role": "base",
    "content": "Answer the user's query using only the contextual chunks below.",
}
def extract_pdf_text(filename: str) -> str:
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        filename: Path of the PDF document to open with ``fitz``.

    Returns:
        The page texts joined together with no separator added between pages.
    """
    with fitz.open(filename) as doc:
        # Join once instead of growing a string with += on every page.
        return "".join(page.get_text() for page in doc)
def chunk_pdf(filename: str) -> List[Chunk]:
    """Ask the Gemini model to split one PDF's text into RAG-sized chunks.

    The PDF text is extracted with ``extract_pdf_text`` and embedded in a
    prompt that asks for a JSON array of chunk objects; the response is
    requested as ``application/json`` constrained by
    ``Settings.response_schema``.

    Args:
        filename: Path of the PDF to chunk. Its base name is written into the
            prompt as the ``source`` of every chunk.

    Returns:
        The parsed chunks from the model response, or an empty list when the
        response carries no parsed payload.
    """
    text = extract_pdf_text(filename)
    pdf_name = Path(filename).name
    prompt = f"""
Split the following text into coherent chunks suitable for RAG.
Each chunk should be 100-500 words.
Do not cut mid-sentence, paragraph, or table.
Preserve headings, bullet points, and tables.
Return an array of JSON objects with this structure:
{{
"content": "<chunk text>",
"source": "{pdf_name}",
"tags": [],
"type": "prg"
}}
Text:
{text}
"""
    # NOTE(review): `genai.Client` / `client.models.generate_content(config=...)`
    # look like the `google-genai` SDK (`from google import genai`), while this
    # file imports `google.generativeai` -- confirm the import matches.
    # The client is created once; the original built it twice.
    client = genai.Client()
    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config={
            "response_mime_type": "application/json",
            "response_schema": Settings.response_schema,
        },
    )
    chunks: List[Chunk] = response.parsed
    if chunks is None:
        # Presumably `parsed` is None when the reply cannot be decoded against
        # the schema (confirm against the SDK docs). Returning [] keeps the
        # caller's `extend` from failing on None.
        print(f"Warning: no parsed chunks returned for {pdf_name}")
        return []
    return chunks
def process_pdf_folder(folder_path):
    """Chunk every ``*.pdf`` directly inside ``folder_path``.

    Files are handled in order of file name. When the folder holds no PDFs a
    message is printed and an empty list is returned.

    Returns:
        One flat list with the chunks of all PDFs, in processing order.
    """
    pdf_paths = sorted(Path(folder_path).glob("*.pdf"), key=lambda p: p.name)
    if not pdf_paths:
        print(f"No PDF files found in {folder_path}")
        return []

    collected = []
    for pdf_path in pdf_paths:
        print(f"Processing PDF: {pdf_path.name}")
        collected.extend(chunk_pdf(filename=pdf_path))
    return collected
def make_prg_chunk(text, filename):
    """Wrap the whole text of one .prg file as a single chunk record.

    Returns:
        A one-element list holding a dict with the stripped text as
        ``content``, the file's base name as ``source``, an empty ``tags``
        list and ``type`` set to ``"prg"``.
    """
    source_name = Path(filename).name
    record = {}
    record["content"] = text.strip()
    record["source"] = source_name
    record["tags"] = []
    record["type"] = "prg"
    return [record]
def process_prg_folder(folder_path):
    """Turn every ``*.prg`` file directly inside ``folder_path`` into a chunk.

    Files are read as UTF-8 (undecodable bytes are dropped) in order of file
    name. When the folder holds no .prg files a message is printed and an
    empty list is returned.

    Returns:
        One flat list with the chunk records of all .prg files.
    """
    prg_paths = sorted(Path(folder_path).glob("*.prg"), key=lambda p: p.name)
    if not prg_paths:
        print(f"No .prg files found in {folder_path}")
        return []

    collected = []
    for prg_path in prg_paths:
        print(f"Processing PRG: {prg_path.name}")
        source_text = prg_path.read_text(encoding="utf-8", errors="ignore")
        collected.extend(make_prg_chunk(source_text, prg_path.name))
    return collected
def read_source_files():
    """Load all .txt / .md files from SOURCE_DIR.

    The directory is created when missing. If it contains no matching files,
    a ``demo.txt`` placeholder is written there and returned instead.

    Returns:
        A list of ``(file_name, file_text)`` tuples, ordered by file name.
    """
    # Create the directory first so a fresh checkout reaches the demo-file
    # fallback below instead of failing inside os.listdir.
    os.makedirs(SOURCE_DIR, exist_ok=True)

    files = []
    # os.listdir returns names in arbitrary order; sorting keeps the ids
    # assigned in build_chunks reproducible and matches the name ordering used
    # by the PDF / PRG loaders.
    for name in sorted(os.listdir(SOURCE_DIR)):
        if not name.lower().endswith((".txt", ".md")):
            continue
        path = os.path.join(SOURCE_DIR, name)
        if not os.path.isfile(path):
            # A directory that merely ends in .txt/.md cannot be opened as text.
            continue
        with open(path, "r", encoding="utf-8") as f:
            files.append((name, f.read()))
    if files:
        return files

    # Provide a fallback demo file if none exist
    demo_path = os.path.join(SOURCE_DIR, "demo.txt")
    demo_text = (
        "This is a demo knowledge file.\n"
        "Add your project or domain documentation as .txt or .md files here."
    )
    with open(demo_path, "w", encoding="utf-8") as f:
        f.write(demo_text)
    files.append(("demo.txt", demo_text))
    return files
def chunk_text(text: str, max_chars: int = 1200, overlap: int = 150):
    """Simple character-based chunking with overlap.

    The stripped text is cut into windows of at most ``max_chars`` characters;
    each window starts ``max_chars - overlap`` characters after the previous
    one, so consecutive windows share ``overlap`` characters.

    Args:
        text: Text to split. Leading/trailing whitespace is removed first.
        max_chars: Maximum window length in characters; must be positive.
        overlap: Characters shared by consecutive windows; must be smaller
            than ``max_chars``.

    Returns:
        The list of chunks, each individually stripped. Empty or
        whitespace-only input yields ``[]``.

    Raises:
        ValueError: If ``max_chars`` is not positive or ``overlap`` is not
            smaller than ``max_chars``. With such values the window never
            advances, which previously made the loop run forever.
    """
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")
    if overlap >= max_chars:
        raise ValueError("overlap must be smaller than max_chars")

    text = text.strip()
    if not text:
        return []

    total = len(text)
    step = max_chars - overlap  # guaranteed >= 1 by the checks above
    chunks = []
    for start in range(0, total, step):
        end = min(total, start + max_chars)
        chunks.append(text[start:end].strip())
        if end >= total:
            # The window reached the end of the text; further windows would
            # only repeat its tail.
            break
    return chunks
def build_chunks():
    """Create chunk objects suitable for embedding.

    Every file returned by ``read_source_files`` is split with ``chunk_text``;
    the pieces are numbered consecutively across all files.

    Returns:
        A list of dicts with the keys ``id``, ``source`` and ``content``.
    """
    pieces = (
        (filename, part)
        for filename, content in read_source_files()
        for part in chunk_text(content)
    )
    return [
        {"id": idx, "source": filename, "content": part}
        for idx, (filename, part) in enumerate(pieces)
    ]
def write_jsonl(path: str, records):
    """Write ``records`` to ``path`` as JSON Lines (one JSON document per line).

    The file is overwritten, encoded as UTF-8, and non-ASCII characters are
    written as-is, not escaped.
    """
    with open(path, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in records
        )
def write_config(path: str):
    """Write system + base prompt config file (list with single object).

    The single object maps ``system_prompt`` to ``SYSTEM_PROMPT`` and
    ``base_chunk`` to ``BASE_CHUNK``; it is written as indented UTF-8 JSON,
    overwriting ``path``.
    """
    config_entry = {
        "system_prompt": SYSTEM_PROMPT,
        "base_chunk": BASE_CHUNK,
    }
    with open(path, "w", encoding="utf-8") as out:
        out.write(json.dumps([config_entry], ensure_ascii=False, indent=2))
def main(
    pdf_folder=r"C:\Users\kogut\Python\Assembler_rag\data\pdfs",
    prg_folder=r"C:\Users\kogut\Python\Assembler_rag\data\prg",
    output_jsonl="output_chunks.jsonl",
):
    """Build all RAG artifacts: PDF/PRG chunks, text chunks and prompt config.

    Args:
        pdf_folder: Folder scanned for ``*.pdf`` files. Defaults to the path
            that used to be hard-coded here.
        prg_folder: Folder scanned for ``*.prg`` files; a falsy value skips
            the PRG step. Defaults to the previously hard-coded path.
        output_jsonl: File that receives the PDF/PRG chunks; a relative path
            is resolved against the current working directory.
    """
    all_chunks = process_pdf_folder(pdf_folder)
    if prg_folder:
        all_chunks += process_prg_folder(prg_folder)

    # NOTE(review): despite the .jsonl name this file holds one indented JSON
    # array, not JSON Lines -- confirm what the reader expects.
    with open(output_jsonl, "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)
    print(f"Finished. {len(all_chunks)} total chunks written to {output_jsonl}")

    print(f"Generating RAG data from: {SOURCE_DIR}")
    chunks = build_chunks()
    print(f"Built {len(chunks)} chunks")
    # NOTE(review): OUTPUT_CHUNKS_FILE is "output_chunks.jsonl" next to this
    # script. When the script runs from its own directory with the default
    # `output_jsonl`, this write replaces the PDF/PRG file written above --
    # confirm which output is the intended one.
    write_jsonl(OUTPUT_CHUNKS_FILE, chunks)
    write_config(RAG_CONFIG_FILE)
    print(f"Wrote chunks to: {OUTPUT_CHUNKS_FILE}")
    print(f"Wrote config to: {RAG_CONFIG_FILE}")
    print("Done.")


if __name__ == "__main__":
    main()
# (stray "|" left by the scraped file view removed)