generated from fastai/nbdev_template
-
Notifications
You must be signed in to change notification settings - Fork 29
Expand file tree
/
Copy pathmulticlass_classification.py
More file actions
82 lines (69 loc) · 3.29 KB
/
multiclass_classification.py
File metadata and controls
82 lines (69 loc) · 3.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# AUTOGENERATED! DO NOT EDIT! File to edit: 11_multiclass_classification.ipynb (unless otherwise specified).
__all__ = ['load_data', 'clean_text', 'preprocess_corpus', 'encode_labels', 'compute_tfidf', 'train_test_model']
# Cell
import os
import time
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
# Cell
def load_data(file_path):
    """Read a comma-delimited file into a pandas DataFrame.

    ``file_path`` is handed straight to ``pandas.read_csv`` and the parsed
    DataFrame is returned as-is.
    """
    return pd.read_csv(file_path, delimiter=',')
# Runs of two or more "x" in the lower-cased text (presumably "XXXX"-style
# masks in the raw data -- confirm against the dataset).
_XX_RUN_RE = re.compile(r'xx+')
# Any run of characters other than the lowercase ASCII letters a-z.
_NON_LETTER_RUN_RE = re.compile(r'[^a-z]+')


def clean_text(text):
    """Reduce a raw document to lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. lower-case the text;
      2. replace every run of two or more ``x`` with a space;
      3. replace every run of characters outside ``a-z`` (digits, punctuation,
         whitespace, non-ASCII letters, ...) with a single space;
      4. strip leading and trailing spaces.

    Args:
        text: The document to clean. Anything that is not a ``str`` (for
            example ``None`` or the float NaN pandas uses for missing cells)
            is treated as an empty document.

    Returns:
        The cleaned string; ``''`` when nothing but separators remains.
    """
    if not isinstance(text, str):
        # Missing values have no text to clean; return an empty document
        # instead of failing on ``.lower()``.
        return ''
    lowered = text.lower()
    without_xx_runs = _XX_RUN_RE.sub(' ', lowered)
    # One pass is enough: once digits and special characters have each been
    # turned into spaces and the spaces collapsed, the outcome is exactly
    # "every run of non-letters becomes one space".
    return _NON_LETTER_RUN_RE.sub(' ', without_xx_runs).strip()
def preprocess_corpus(df, column='text'):
    """Run ``clean_text`` over every entry of ``df[column]``.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame object is returned.
    """
    cleaned_docs = df[column].apply(clean_text)
    df[column] = cleaned_docs
    return df
def encode_labels(labels):
    """Fit a ``LabelEncoder`` on ``labels`` and return the fitted encoder.

    The labels are not transformed here; use the returned encoder to convert
    between class labels and their numeric codes.
    """
    # ``fit`` returns the encoder itself, so the fitted instance is returned.
    return preprocessing.LabelEncoder().fit(labels)
def compute_tfidf(corpus, stop_words='english', ngram_range=(1, 1), max_features=None,
                  min_df=3, max_df=0.9):
    """Fit a ``TfidfVectorizer`` on ``corpus`` and return the tf-idf features.

    Args:
        corpus: Iterable of raw text documents (the vectorizer is created with
            ``input='content'``).
        stop_words: Forwarded to ``TfidfVectorizer(stop_words=...)``.
        ngram_range: Forwarded to ``TfidfVectorizer(ngram_range=...)``.
        max_features: Forwarded to ``TfidfVectorizer(max_features=...)``.
        min_df: Forwarded to ``TfidfVectorizer(min_df=...)``. The default of 3
            matches the value that used to be hard-coded; lower it for small
            corpora.
        max_df: Forwarded to ``TfidfVectorizer(max_df=...)``. The default of
            0.9 matches the value that used to be hard-coded.

    Returns:
        A tuple ``(X, vectorizer)``: the (documents, features) matrix produced
        by ``fit_transform`` and the fitted vectorizer.
    """
    vectorizer = TfidfVectorizer(
        input='content',
        stop_words=stop_words,
        ngram_range=ngram_range,
        min_df=min_df,
        max_df=max_df,
        max_features=max_features,
    )
    # flush so the progress message shows up before the (possibly slow) fit,
    # not together with 'done!' once the line is finally terminated.
    print('Computing tfidf features...', end='', flush=True)
    X = vectorizer.fit_transform(corpus)
    print('done!')
    return X, vectorizer
def train_test_model(model, X_train, X_test, y_train, y_test, labels):
    """Fit ``model`` on the training split and evaluate it on the test split.

    Args:
        model: Estimator object providing ``fit``, ``predict`` and ``score``.
        X_train: Training features, passed to ``model.fit``.
        X_test: Test features, passed to ``model.predict`` and ``model.score``.
        y_train: Training targets, passed to ``model.fit``.
        y_test: Test targets the predictions are compared against.
        labels: Display names handed to ``classification_report`` as
            ``target_names``.

    Returns:
        A tuple ``(predictions, accuracy, metrics_report, (precision, recall,
        fscore))`` where ``accuracy`` is the value of ``model.score`` on the
        test split, ``metrics_report`` is the output of
        ``classification_report`` and the last item holds the weighted-average
        precision, recall and F-score.
    """
    # flush so each progress message is visible while the step is running
    # rather than only once the line is terminated by 'done!'.
    print('Start training...', end='', flush=True)
    model.fit(X_train, y_train)
    print('done!')

    print('Start testing...', end='', flush=True)
    predictions = model.predict(X_test)
    # Kept as model.score (not recomputed from ``predictions``) so estimators
    # that define their own scoring keep reporting the same value.
    accuracy = model.score(X_test, y_test)
    metrics_report = classification_report(y_test, predictions, target_names=labels)
    # The fourth element (support) is not part of the return value.
    precision, recall, fscore, _ = score(y_test, predictions, average='weighted')
    print('done!')
    return predictions, accuracy, metrics_report, (precision, recall, fscore)