import re
from collections import Counter

import fitz  # PyMuPDF
import gradio as gr
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ----------------------------
# 📦 Setup
# ----------------------------
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))
# ----------------------------
# 📘 PDF Text Extraction
# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF (filepath string or file-like object)."""
    # Depending on the Gradio version, gr.File may hand over a filepath string
    # or a tempfile wrapper, so accept both rather than calling .read() blindly.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = ""
    with fitz.open(path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text.strip()
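
# Example standalone usage (hypothetical local file, outside the Gradio flow):
#   text = extract_text_from_pdf("paper.pdf")
#   print(text[:200])
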
# ----------------------------
# 🧹 Text Cleaning
# ----------------------------
def clean_text(text):
    """Lowercase, strip non-letters, and drop stopwords and tokens of length <= 2."""
    text = re.sub(r"[^a-zA-Z ]", " ", text)
    words = [w.lower() for w in text.split() if w.lower() not in STOPWORDS and len(w) > 2]
    return words
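
# For instance, clean_text("The model's accuracy improved!") yields
# ['model', 'accuracy', 'improved']: punctuation becomes whitespace, then
# stopwords and very short tokens are filtered out.
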
# ----------------------------
# 🤖 Topic Modeling Function
# ----------------------------
def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
    """Cluster sentences into topics using transformer embeddings."""
    # Loaded per call for simplicity; caching the model at module level would
    # avoid the reload cost on every request.
    model = SentenceTransformer('flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot')
    embeddings = model.encode(sentences, show_progress_bar=False)

    # --- Auto-select topic number ---
    if auto_topics:
        if len(sentences) < 3:
            num_topics = 1
        else:
            scores = []
            # Silhouette analysis is only defined for 2 <= k <= n_samples - 1.
            for k in range(2, min(max_k, len(sentences) - 1) + 1):
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
                try:
                    score = silhouette_score(embeddings, kmeans.labels_)
                    scores.append((k, score))
                except ValueError:
                    continue
            num_topics = max(scores, key=lambda x: x[1])[0] if scores else 2
    else:
        num_topics = fixed_k

    # --- Clustering ---
    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": kmeans.labels_})

    # --- Build topic summaries ---
    topic_data = []
    for topic_id in range(num_topics):
        topic_sentences = df[df["Topic"] == topic_id]["Sentence"].tolist()
        words = []
        for s in topic_sentences:
            words.extend(clean_text(s))
        word_freq = Counter(words)
        top_words = [w for w, _ in word_freq.most_common(3)]
        title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
        examples = topic_sentences[:3]
        topic_data.append((f"Topic {topic_id + 1}: {title}", "\n".join(examples)))
    return topic_data, num_topics
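
# Illustrative shape of the return value (not actual model output):
#   topic_data = [("Topic 1: Climate & energy & policy",
#                  "example sentence one\nexample sentence two"), ...]
#   num_topics = 2
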
# ----------------------------
# 🚀 Gradio Interface Logic
# ----------------------------
def analyze_input(pdf_file, essay_text):
    """Combine extracted PDF text with typed essay text, then run topic modeling."""
    pdf_text = ""
    if pdf_file:
        pdf_text = extract_text_from_pdf(pdf_file)
    full_text = (pdf_text + "\n" + (essay_text or "")).strip()
    if not full_text:
        return "❌ Please upload a PDF or write an essay."

    # Split on sentence-ending punctuation; keep only reasonably long sentences.
    sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
    if len(sentences) < 2:
        return "⚠️ Not enough text for topic modeling."

    topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)

    # --- Display output ---
    output_text = f"✅ **Detected {num_topics} Topics:**\n\n"
    for title, examples in topic_data:
        output_text += f"### {title}\n{examples}\n\n"
    return output_text
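
# Quick standalone check without launching the UI (assumes a local essay file):
#   print(analyze_input(None, open("essay.txt").read()))
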
# ----------------------------
# 🎨 Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📂 Upload PDF (optional)"),
        gr.Textbox(lines=10, placeholder="✍️ Write or paste your essay here...", label="Essay Text")
    ],
    outputs=gr.Markdown(label="🧠 Detected Topics"),
    title="PDF + Essay Topic Discovery (Transformer-Based)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes the main topics using transformer embeddings."
)

if __name__ == "__main__":
    demo.launch()
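    # demo.launch(share=True) would also expose a temporary public URL, and
    # the standard server_name / server_port launch() options pin the host
    # or port if the defaults don't suit your deployment.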