Anirban0011's picture
upd
fcd2005
raw
history blame contribute delete
314 Bytes
import re
import string
from unidecode import unidecode
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Combined English + Indonesian stopword set (NLTK corpora must be downloaded).
# NOTE(review): not referenced by clean_text below — presumably consumed by code
# outside this file/chunk; verify before removing.
stop_words = set(stopwords.words("english")) | set(stopwords.words("indonesian"))
def clean_text(text):
    """Normalize raw text for downstream processing.

    Transliterates any non-ASCII characters to their closest ASCII
    equivalents (via ``unidecode``) and lowercases the result.
    Note: does not tokenize or strip stopwords.
    """
    ascii_text = unidecode(text)
    return ascii_text.lower()