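"""PDF + Essay Topic Discovery (Transformer-Based).

Gradio app that extracts text from an uploaded PDF and/or a pasted essay,
splits it into sentences, embeds them with a SentenceTransformer model, and
clusters the embeddings with KMeans to surface the main topics.
"""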
import re
import fitz  # PyMuPDF
import pandas as pd
from collections import Counter
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
import nltk
import gradio as gr

# ----------------------------
# Setup
# ----------------------------
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))

# ----------------------------
# PDF Text Extraction
# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF file."""
    # Depending on the Gradio version, gr.File yields either a file path (str)
    # or a tempfile-like object exposing .name; handle both and open by path,
    # which avoids reading from an already-closed temp file.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = ""
    with fitz.open(path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text.strip()

# ----------------------------
# Text Cleaning
# ----------------------------
def clean_text(text):
    """Clean and remove stopwords"""
    text = re.sub(r"[^a-zA-Z ]", " ", text)
    words = [w.lower() for w in text.split() if w.lower() not in STOPWORDS and len(w) > 2]
    return words

# ----------------------------
# Topic Modeling Function
# ----------------------------
def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
    """Cluster sentences into topics using transformer embeddings"""
    model = SentenceTransformer('flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot')
    embeddings = model.encode(sentences, show_progress_bar=False)

    # --- Auto-select topic number via silhouette score ---
    if auto_topics:
        if len(sentences) < 3:
            num_topics = 1
        else:
            scores = []
            # Silhouette score requires 2 <= k <= len(sentences) - 1.
            for k in range(2, min(max_k, len(sentences) - 1) + 1):
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
                try:
                    score = silhouette_score(embeddings, kmeans.labels_)
                    scores.append((k, score))
                except ValueError:
                    continue
            # Pick the k with the highest silhouette score; fall back to 2.
            num_topics = max(scores, key=lambda x: x[1])[0] if scores else 2
    else:
        # Never ask for more clusters than there are sentences.
        num_topics = min(fixed_k, len(sentences))

    # --- Clustering ---
    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": kmeans.labels_})

    # --- Build topic summaries ---
    topic_data = []
    for topic_id in range(num_topics):
        topic_sentences = df[df["Topic"] == topic_id]["Sentence"].tolist()
        words = []
        for s in topic_sentences:
            words.extend(clean_text(s))
        word_freq = Counter(words)
        top_words = [w for w, _ in word_freq.most_common(3)]
        title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
        examples = topic_sentences[:3]
        topic_data.append((f"Topic {topic_id + 1}: {title}", "\n".join(examples)))
    return topic_data, num_topics

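# Example usage of transformer_topic_modeling outside the Gradio app, with
# made-up sentences (illustrative only; not part of the app flow):
#
#   sample = [
#       "Solar panels convert sunlight into electricity.",
#       "Wind turbines generate power from moving air.",
#       "A balanced diet includes proteins, fats, and carbohydrates.",
#       "Vitamins and minerals support the immune system.",
#   ]
#   topics, k = transformer_topic_modeling(sample, auto_topics=True)
#   for title, examples in topics:
#       print(title, examples, sep="\n")
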
# ----------------------------
# Gradio Interface Logic
# ----------------------------
def analyze_input(pdf_file, essay_text):
    pdf_text = ""
    if pdf_file:
        pdf_text = extract_text_from_pdf(pdf_file)
    full_text = (pdf_text + "\n" + (essay_text or "")).strip()
    if not full_text:
        return "Please upload a PDF or write an essay."
    sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
    if len(sentences) < 2:
        return "Not enough text for topic modeling."
    topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)

    # --- Display output ---
    output_text = f"**Detected {num_topics} Topics:**\n\n"
    for title, examples in topic_data:
        output_text += f"### {title}\n{examples}\n\n"
    return output_text

# ----------------------------
# Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="Upload PDF (optional)"),
        gr.Textbox(lines=10, placeholder="Write or paste your essay here...", label="Essay Text")
    ],
    outputs=gr.Markdown(label="Detected Topics"),
    title="PDF + Essay Topic Discovery (Transformer-Based)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes main topics using transformer embeddings."
)

if __name__ == "__main__":
    demo.launch()
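
# To run locally (assuming this file is saved as app.py), install the
# dependencies used above and start the app:
#   pip install pymupdf pandas scikit-learn sentence-transformers nltk gradio
#   python app.py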