import re
# Regex patterns for transcript annotations that should not become tokens.
parentheticals = [
    r"\(laughter\)",
    r"\(applause\)",
    r"\(music\)",
    r"\(video\)",
]


def clean_parens(text: str) -> str:
    """Replace transcript parentheticals in *text* with a single space.

    Every pattern in ``parentheticals`` is matched case-insensitively, so
    ``(Laughter)`` and ``(LAUGHTER)`` are replaced as well. No other
    normalisation is performed; in particular the text is NOT lowercased.

    Args:
        text: The raw document string.

    Returns:
        ``text`` with every parenthetical match replaced by ``' '``.
    """
    new_text = text
    for rgx_match in parentheticals:
        # A space (not '') is substituted so neighbouring words stay separate.
        new_text = re.sub(rgx_match, ' ', new_text, flags=re.IGNORECASE)
    return new_text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Minimal corpus; the third document carries a "(laughter)" annotation that
# clean_parens is expected to strip before tokenisation.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one. (laughter)',
    'Is this the first document?',
]

# A callable ``preprocessor`` overrides the built-in preprocessing stage
# (strip_accents and lowercase), so ``lowercase=True`` (the default) has no
# effect here.
vectorizer = CountVectorizer(preprocessor=clean_parens)
X = vectorizer.fit_transform(corpus)

# ``get_feature_names`` was removed in scikit-learn 1.2; prefer
# ``get_feature_names_out`` (added in 1.0) and fall back on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
# Capitalised tokens survive because no lowercasing was applied:
# ['And', 'Is', 'This', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
Describe the bug
This is based on the SO post here.
Shouldn't we raise an error when `lowercase == True` and `preprocessor` is not None, since the lowercasing is not going to be applied when the preprocessor is a callable?

Steps/Code to Reproduce: