# models/keyword_extractor.py
import spacy
from collections import Counter


class KeywordExtractor:
    def __init__(self):
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Please install the spaCy English model: python -m spacy download en_core_web_sm")
            raise

    def extract_keywords(self, text):
        """Extract keywords and named entities from text."""
        doc = self.nlp(text)
        keywords = []

        # Extract named entities of selected types
        for ent in doc.ents:
            if ent.label_ in ['PERSON', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'DATE']:
                keywords.append(ent.text)

        # Extract short noun phrases (skip very long chunks)
        for chunk in doc.noun_chunks:
            if len(chunk.text.split()) <= 3:
                keywords.append(chunk.text)

        # Extract individual important words (nouns and proper nouns)
        for token in doc:
            if (token.pos_ in ['NOUN', 'PROPN'] and
                    not token.is_stop and
                    not token.is_punct and
                    len(token.text) > 2):
                keywords.append(token.text)

        # Remove duplicates and return the most common keywords
        keyword_counts = Counter(keywords)
        return [word for word, count in keyword_counts.most_common(10)]
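

# Minimal usage sketch (not part of the original file): assumes en_core_web_sm
# is installed; the sample sentence below is purely illustrative.
if __name__ == "__main__":
    extractor = KeywordExtractor()
    sample = "Apple opened a new office in Berlin in March 2024."
    # Prints up to 10 keywords: entity names (e.g. "Apple", "Berlin"),
    # short noun phrases, and frequent nouns, ranked by frequency.
    print(extractor.extract_keywords(sample))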