import re
from collections import Counter

import fitz  # PyMuPDF
import gradio as gr
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# ----------------------------
# 📦 Setup
# ----------------------------
nltk.download('stopwords', quiet=True)
STOPWORDS = set(stopwords.words('english'))
# ----------------------------
# 📘 PDF Text Extraction
# ----------------------------
def extract_text_from_pdf(pdf_file):
    """Extract text from an uploaded PDF (filepath string or file-like object)."""
    # Depending on the Gradio version, gr.File may hand over a filepath string
    # or a tempfile wrapper, so accept both rather than calling .read() blindly.
    path = pdf_file if isinstance(pdf_file, str) else pdf_file.name
    text = ""
    with fitz.open(path) as doc:
        for page in doc:
            text += page.get_text("text")
    return text.strip()
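
# Example standalone usage (hypothetical local file, outside the Gradio flow):
#   text = extract_text_from_pdf("paper.pdf")
#   print(text[:200])
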
# ----------------------------
# 🧹 Text Cleaning
# ----------------------------
def clean_text(text):
    """Lowercase, strip non-letters, and drop stopwords and tokens of length <= 2."""
    text = re.sub(r"[^a-zA-Z ]", " ", text)
    words = [w.lower() for w in text.split() if w.lower() not in STOPWORDS and len(w) > 2]
    return words
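
# For instance, clean_text("The model's accuracy improved!") yields
# ['model', 'accuracy', 'improved']: punctuation becomes whitespace, then
# stopwords and very short tokens are filtered out.
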
# ----------------------------
# 🤖 Topic Modeling Function
# ----------------------------
def transformer_topic_modeling(sentences, auto_topics=True, max_k=8, fixed_k=5):
    """Cluster sentences into topics using transformer embeddings."""
    # Loaded per call for simplicity; caching the model at module level would
    # avoid the reload cost on every request.
    model = SentenceTransformer('flax-sentence-embeddings/multi-qa_v1-distilbert-cls_dot')
    embeddings = model.encode(sentences, show_progress_bar=False)

    # --- Auto-select topic number ---
    if auto_topics:
        if len(sentences) < 3:
            num_topics = 1
        else:
            scores = []
            # Silhouette analysis is only defined for 2 <= k <= n_samples - 1.
            for k in range(2, min(max_k, len(sentences) - 1) + 1):
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(embeddings)
                try:
                    score = silhouette_score(embeddings, kmeans.labels_)
                    scores.append((k, score))
                except ValueError:
                    continue
            num_topics = max(scores, key=lambda x: x[1])[0] if scores else 2
    else:
        num_topics = fixed_k

    # --- Clustering ---
    kmeans = KMeans(n_clusters=num_topics, random_state=42, n_init=10)
    kmeans.fit(embeddings)
    df = pd.DataFrame({"Sentence": sentences, "Topic": kmeans.labels_})

    # --- Build topic summaries ---
    topic_data = []
    for topic_id in range(num_topics):
        topic_sentences = df[df["Topic"] == topic_id]["Sentence"].tolist()
        words = []
        for s in topic_sentences:
            words.extend(clean_text(s))
        word_freq = Counter(words)
        top_words = [w for w, _ in word_freq.most_common(3)]
        title = " & ".join(top_words).capitalize() if top_words else "Miscellaneous"
        examples = topic_sentences[:3]
        topic_data.append((f"Topic {topic_id + 1}: {title}", "\n".join(examples)))
    return topic_data, num_topics
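
# Illustrative shape of the return value (not actual model output):
#   topic_data = [("Topic 1: Climate & energy & policy",
#                  "example sentence one\nexample sentence two"), ...]
#   num_topics = 2
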
# ----------------------------
# 🚀 Gradio Interface Logic
# ----------------------------
def analyze_input(pdf_file, essay_text):
    """Combine extracted PDF text with typed essay text, then run topic modeling."""
    pdf_text = ""
    if pdf_file:
        pdf_text = extract_text_from_pdf(pdf_file)
    full_text = (pdf_text + "\n" + (essay_text or "")).strip()
    if not full_text:
        return "❌ Please upload a PDF or write an essay."

    # Split on sentence-ending punctuation; keep only reasonably long sentences.
    sentences = [s.strip() for s in re.split(r'[.!?]', full_text) if len(s.strip()) > 20]
    if len(sentences) < 2:
        return "⚠️ Not enough text for topic modeling."

    topic_data, num_topics = transformer_topic_modeling(sentences, auto_topics=True)

    # --- Display output ---
    output_text = f"✅ **Detected {num_topics} Topics:**\n\n"
    for title, examples in topic_data:
        output_text += f"### {title}\n{examples}\n\n"
    return output_text
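
# Quick standalone check without launching the UI (assumes a local essay file):
#   print(analyze_input(None, open("essay.txt").read()))
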
# ----------------------------
# 🎨 Gradio Interface
# ----------------------------
demo = gr.Interface(
    fn=analyze_input,
    inputs=[
        gr.File(label="📂 Upload PDF (optional)"),
        gr.Textbox(lines=10, placeholder="✍️ Write or paste your essay here...", label="Essay Text")
    ],
    outputs=gr.Markdown(label="🧠 Detected Topics"),
    title="PDF + Essay Topic Discovery (Transformer-Based)",
    description="Upload a PDF and/or write an essay. The system identifies and summarizes the main topics using transformer embeddings."
)

if __name__ == "__main__":
    demo.launch()
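    # demo.launch(share=True) would also expose a temporary public URL, and
    # the standard server_name / server_port launch() options pin the host
    # or port if the defaults don't suit your deployment.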