Spaces:
Sleeping
Sleeping
File size: 4,465 Bytes
b94af5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 |
import re
import fitz # PyMuPDF
import pandas as pd
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
import nltk
import gradio as gr
# ----------------------------
# 📦 Setup
# ----------------------------
# Make sure the NLTK stop-word corpus is available before it is read below.
nltk.download("stopwords", quiet=True)
# English stop words held as a set; clean_text tests membership against it.
STOPWORDS = {*stopwords.words("english")}
# ----------------------------
# PDF Text Extraction
# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of a PDF.

    Args:
        pdf_file: The PDF to read. Accepted forms:
            * a file-like object exposing ``read()`` (read fully and parsed
              from memory, as before);
            * raw ``bytes`` / ``bytearray`` holding the PDF content;
            * a path string, or any other object whose ``name`` attribute
              is a path.
            NOTE(review): the non-``read()`` forms are accepted because the
            Gradio File component can hand the callback a path instead of an
            open file, depending on its version / ``type`` setting — confirm
            against the installed Gradio.

    Returns:
        str: The concatenated page text with leading and trailing
        whitespace removed.
    """
    if hasattr(pdf_file, "read"):
        # File-like upload: parse straight from the in-memory bytes.
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
    elif isinstance(pdf_file, (bytes, bytearray)):
        doc = fitz.open(stream=bytes(pdf_file), filetype="pdf")
    else:
        # Path string, or a wrapper object that carries the path in ``.name``.
        path = getattr(pdf_file, "name", pdf_file)
        doc = fitz.open(str(path), filetype="pdf")

    # The context manager closes the document even if a page fails to parse.
    with doc:
        return "".join(page.get_text("text") for page in doc).strip()
# ----------------------------
# 🧹 Text Cleaning
# ----------------------------
def clean_text(text):
    """Tokenize *text* into lowercase content words.

    Every character other than an ASCII letter or a space is replaced by a
    space, the result is split on whitespace, and tokens that are two
    characters or shorter, or that appear in ``STOPWORDS``, are discarded.

    Returns:
        list[str]: The surviving tokens, lowercased, in original order.
    """
    letters_only = re.sub(r"[^a-zA-Z ]", " ", text)
    kept = []
    for token in letters_only.split():
        if len(token) <= 2:
            continue
        lowered = token.lower()
        if lowered in STOPWORDS:
            continue
        kept.append(lowered)
    return kept
# ----------------------------
# Topic Modeling Function
# ----------------------------
_EMBEDDING_MODEL_NAME = 'flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot'
# Loaded models keyed by name, so the model is constructed once per process
# instead of once per call.
_embedding_model_cache = {}


def _get_embedding_model():
    """Return the sentence-embedding model, constructing it on first use only."""
    if _EMBEDDING_MODEL_NAME not in _embedding_model_cache:
        _embedding_model_cache[_EMBEDDING_MODEL_NAME] = SentenceTransformer(_EMBEDDING_MODEL_NAME)
    return _embedding_model_cache[_EMBEDDING_MODEL_NAME]


def _cluster_labels(embeddings, k):
    """Fit KMeans with *k* clusters (fixed seed) and return the per-row labels."""
    return KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings).labels_


def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
    """Cluster sentences into topics using transformer embeddings.

    Args:
        sentences: Sentences to cluster.
        auto_topics: When true, pick the number of topics by the best
            silhouette score; otherwise use ``fixed_k``.
        max_k: Largest topic count tried in auto mode (inclusive).
        fixed_k: Topic count used when ``auto_topics`` is false. It is
            clamped to the range ``1..len(sentences)`` because KMeans cannot
            form more clusters than there are samples.

    Returns:
        tuple[list[tuple[str, str]], int]: ``(topic_data, num_topics)``.
        Each ``topic_data`` entry is ``(heading, examples)``: the heading is
        ``"Topic N: <up to three most frequent cleaned words>"`` (or
        ``"Miscellaneous"`` when no words survive cleaning) and ``examples``
        holds up to three member sentences joined by newlines. Empty input
        yields ``([], 0)``.
    """
    sentences = list(sentences)
    n_sentences = len(sentences)
    if n_sentences == 0:
        # Nothing to embed or cluster.
        return [], 0

    embeddings = _get_embedding_model().encode(sentences, show_progress_bar=False)

    labels = None
    if auto_topics:
        if n_sentences < 3:
            num_topics = 1
        else:
            num_topics = 2  # fallback when no candidate k can be scored
            best_score = None
            # Silhouette needs at most n - 1 clusters; max_k itself is included.
            for k in range(2, min(max_k, n_sentences - 1) + 1):
                candidate = _cluster_labels(embeddings, k)
                try:
                    score = silhouette_score(embeddings, candidate)
                except ValueError:
                    # Raised when the fit collapses to an unscorable number
                    # of distinct labels; skip this k.
                    continue
                # Strict '>' keeps the smallest k on ties.
                if best_score is None or score > best_score:
                    best_score, num_topics, labels = score, k, candidate
    else:
        num_topics = max(1, min(fixed_k, n_sentences))

    if labels is None:
        # No scored candidate to reuse (fixed mode, tiny input, or fallback).
        labels = _cluster_labels(embeddings, num_topics)

    # Group sentences by cluster, preserving their original order.
    grouped = [[] for _ in range(num_topics)]
    for sentence, label in zip(sentences, labels):
        grouped[int(label)].append(sentence)

    topic_data = []
    for topic_id, topic_sentences in enumerate(grouped):
        word_freq = Counter(
            word for sentence in topic_sentences for word in clean_text(sentence)
        )
        top_words = [word for word, _ in word_freq.most_common(3)]
        title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
        examples = "\n".join(topic_sentences[:3])
        topic_data.append((f"Topic {topic_id + 1}: {title}", examples))
    return topic_data, num_topics
# ----------------------------
# Gradio Interface Logic
# ----------------------------
def analyze_input(pdf_file, essay_text):
    """Run topic discovery over an uploaded PDF and/or pasted essay text.

    Args:
        pdf_file: The uploaded PDF, or a falsy value when none was given.
            It is passed unchanged to ``extract_text_from_pdf``.
        essay_text: Free text typed by the user; may be ``None`` or empty.

    Returns:
        str: Markdown listing the detected topics, or a short status message
        when there is no input or too little text to cluster.
    """
    pdf_text = extract_text_from_pdf(pdf_file) if pdf_file else ""
    full_text = (pdf_text + "\n" + (essay_text or "")).strip()
    if not full_text:
        return "❌ Please upload a PDF or write an essay."

    # Split on sentence-ending punctuation and keep only fragments longer
    # than 20 characters, which drops headings and stray tokens.
    fragments = (part.strip() for part in re.split(r"[.!?]", full_text))
    sentences = [fragment for fragment in fragments if len(fragment) > 20]
    if len(sentences) < 2:
        return "⚠️ Not enough text for topic modeling."

    topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)

    # Header line followed by one Markdown section per topic.
    sections = [f"✅ **Detected {num_topics} Topics:**\n\n"]
    sections.extend(f"### {title}\n{examples}\n\n" for title, examples in topic_data)
    return "".join(sections)
# ----------------------------
# 🎨 Gradio Interface
# ----------------------------
# Web UI: an optional PDF upload plus a free-text box feed analyze_input,
# and the returned Markdown string is rendered as the result.
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="Upload PDF (optional)"),
        gr.Textbox(
            lines=10,
            placeholder="✍️ Write or paste your essay here...",
            label="Essay Text",
        ),
    ],
    outputs=gr.Markdown(label="🧠 Detected Topics"),
    title="PDF + Essay Topic Discovery (Transformer-Based)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings.",
)

# Start the app only when this file is run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()
|