Spaces:
Running
Running
| #RAG method | |
| from PyPDF2 import PdfReader | |
| from langchain.document_loaders import PyPDFLoader | |
| from langchain.docstore.document import Document | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings | |
| from langchain_core.vectorstores import InMemoryVectorStore | |
| from dotenv import load_dotenv | |
| import os | |
| load_dotenv() | |
| hf_token = os.getenv("HF_TOKEN") | |
def load_and_chunk_pdfs(directory_path, chunk_size=1000, chunk_overlap=100):
    """Load every PDF in a directory and split the extracted text into chunks.

    Only the directory itself is scanned (no recursion). The text of all pages
    of a PDF is concatenated into a single ``Document`` built with
    ``{"source": filename}`` metadata, and the documents are then split with
    ``RecursiveCharacterTextSplitter``.

    Args:
        directory_path: Path of the directory to scan for PDF files.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter``.

    Returns:
        list: The chunked documents, or an empty list when the directory
        contains no PDF files.

    Raises:
        OSError: If ``directory_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    docs = []
    # os.listdir returns entries in arbitrary order; sort so that the chunk
    # order is reproducible between runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        # Case-insensitive match so files named "REPORT.PDF" are not skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        reader = PdfReader(os.path.join(directory_path, filename))
        # Join once instead of repeated "+=", and fall back to "" in case a
        # page yields no text (defensive: avoids concatenating None).
        text = "".join(page.extract_text() or "" for page in reader.pages)
        docs.append(Document(page_content=text, metadata={"source": filename}))

    if not docs:
        # Nothing to split; skip building the splitter.
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(docs)
def create_retriever(
    documents: list,
    api_key=None,
    model_name="sentence-transformers/all-MiniLM-l6-v2",
):
    """Create a retriever backed by Hugging Face embeddings and an in-memory store.

    Args:
        documents (list): The documents to embed and add to the vector store.
        api_key (str, optional): Hugging Face API key. When omitted (``None``),
            the module-level ``hf_token`` (read from the ``HF_TOKEN``
            environment variable) is used.
        model_name (str): The model name used for the embeddings.

    Returns:
        retriever: A retriever object to query the vector store.

    Raises:
        ValueError: If no API key is supplied and ``HF_TOKEN`` is not set.
    """
    token = api_key if api_key is not None else hf_token
    if not token:
        # Fail early with an actionable message instead of handing an empty
        # or missing token to the embeddings client.
        raise ValueError(
            "Hugging Face API key is missing: pass api_key or set HF_TOKEN."
        )

    embeddings = HuggingFaceInferenceAPIEmbeddings(api_key=token, model_name=model_name)
    vectorstore = InMemoryVectorStore(embedding=embeddings)
    if documents:
        # Skip the embedding call entirely when there is nothing to index.
        vectorstore.add_documents(documents)
    return vectorstore.as_retriever()