-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbenchmark_count_vectorizer.py
More file actions
56 lines (39 loc) · 1.52 KB
/
benchmark_count_vectorizer.py
File metadata and controls
56 lines (39 loc) · 1.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import timeit
import pandas as pd
from sklearn.feature_extraction.text import (CountVectorizer, TfidfVectorizer,
HashingVectorizer)
from sklearn.datasets import fetch_20newsgroups
from tqdm import tqdm
import itertools
import numpy as np
# Corpus fed to every benchmarked vectorizer below: the documents (``.data``)
# of the 'all' subset of the 20 newsgroups dataset.
text = fetch_20newsgroups(subset='all').data
def pandas_df_to_markdown_table(df):
    """Print ``df`` as a pipe-delimited table with a ``---`` separator row.

    A one-row frame holding ``---`` in every column is stacked on top of
    ``df``; the combined frame is rendered with ``to_csv`` using ``|`` as
    the separator and without the index, so the output is the header line,
    the ``---`` line, then the data rows. The rendered text is printed;
    nothing is returned.
    """
    n_cols = len(df.columns)
    separator_row = pd.DataFrame([['---'] * n_cols], columns=df.columns)
    table = pd.concat([separator_row, df])
    rendered = table.to_csv(sep="|", index=False)
    print(rendered)
# Accumulates one result dict per benchmarked configuration; converted to a
# DataFrame once all timings have been collected.
res = []
def run_vectorizer(Vectorizer, X, **params):
    """Build a zero-argument callable that fits a fresh vectorizer on ``X``.

    Nothing is executed when this function is called. Each invocation of
    the returned callable instantiates ``Vectorizer(**params)`` and calls
    its ``fit_transform(X)``, discarding the result, so construction is
    included in whatever is timed around the call.
    """
    def fit_once():
        # A new instance per call keeps state from leaking between runs.
        Vectorizer(**params).fit_transform(X)

    return fit_once
# Benchmark grid: every vectorizer class is timed with every
# (analyzer, ngram_range) setting.
vectorizers = [CountVectorizer, TfidfVectorizer, HashingVectorizer]
settings = [
    ('word', (1, 1)),
    ('word', (1, 2)),
    ('word', (1, 4)),
    ('char', (4, 4)),
    ('char_wb', (4, 4)),
]

# Materialize the product so tqdm can take len() of it and display a total
# and ETA; a bare itertools.product iterator has no length.
grid = list(itertools.product(vectorizers, settings))

for Vectorizer, (analyzer, ngram_range) in tqdm(grid):
    bench = {'vectorizer': Vectorizer.__name__}
    params = {'analyzer': analyzer, 'ngram_range': ngram_range}
    bench.update(params)

    # Three repeats of a single fit_transform call each (number=1).
    dt = timeit.repeat(run_vectorizer(Vectorizer, text, **params),
                       number=1,
                       repeat=3)

    # Stored as a preformatted "mean(±std)" string over the repeats.
    bench['time'] = "{:.2f}(±{:.2f})".format(np.mean(dt), np.std(dt))
    res.append(bench)

# Pivot the results: one row per (analyzer, ngram_range), one column per
# vectorizer, each cell holding the formatted timing string.
df = pd.DataFrame(res).set_index(['analyzer', 'ngram_range', 'vectorizer'])
df = df['time'].unstack(level='vectorizer').reset_index()
pandas_df_to_markdown_table(df)