-
Notifications
You must be signed in to change notification settings - Fork 26
Expand file tree
/
Copy pathTFIDF.cs
More file actions
285 lines (244 loc) · 10.4 KB
/
TFIDF.cs
File metadata and controls
285 lines (244 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
using EnglishStemmer;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
namespace TFIDFExample
{
/// <summary>
/// Copyright (c) 2013 Kory Becker http://www.primaryobjects.com/kory-becker.aspx
///
/// Permission is hereby granted, free of charge, to any person obtaining
/// a copy of this software and associated documentation files (the
/// "Software"), to deal in the Software without restriction, including
/// without limitation the rights to use, copy, modify, merge, publish,
/// distribute, sublicense, and/or sell copies of the Software, and to
/// permit persons to whom the Software is furnished to do so, subject to
/// the following conditions:
///
/// The above copyright notice and this permission notice shall be
/// included in all copies or substantial portions of the Software.
///
/// Description:
/// Performs a TF*IDF (Term Frequency * Inverse Document Frequency) transformation on an array of documents.
/// Each document string is transformed into an array of doubles, corresponding to their associated TF*IDF values.
///
/// Usage:
/// string[] documents = LoadYourDocuments();
///
/// double[][] inputs = TFIDF.Transform(documents);
/// inputs = TFIDF.Normalize(inputs);
///
/// </summary>
public static class TFIDF
{
    /// <summary>
    /// Document vocabulary, mapping each stemmed term to its IDF (inverse document frequency) value.
    /// Populated by the first call to <see cref="Transform"/>, or by <see cref="Load"/>.
    /// </summary>
    private static Dictionary<string, double> _vocabularyIDF = new Dictionary<string, double>();

    /// <summary>
    /// Transforms a list of documents into their associated TF*IDF vectors.
    /// If a vocabulary does not yet exist, one is created from the documents' stemmed words.
    /// </summary>
    /// <param name="documents">Documents to transform.</param>
    /// <param name="vocabularyThreshold">Minimum number of occurrences of a term across all documents for it to enter the vocabulary.</param>
    /// <returns>One TF*IDF vector (double[]) per document, in the same order as <paramref name="documents"/>.</returns>
    public static double[][] Transform(string[] documents, int vocabularyThreshold = 3)
    {
        List<List<string>> stemmedDocs;

        // Build the vocabulary and stem the documents in a single pass.
        List<string> vocabulary = GetVocabulary(documents, out stemmedDocs, vocabularyThreshold);

        if (_vocabularyIDF.Count == 0)
        {
            // Calculate the IDF for each vocabulary term.
            // The +1 in the denominator avoids division by zero for terms found in no document.
            foreach (var term in vocabulary)
            {
                double numberOfDocsContainingTerm = stemmedDocs.Count(d => d.Contains(term));
                _vocabularyIDF[term] = Math.Log(stemmedDocs.Count / (1.0 + numberOfDocsContainingTerm));
            }
        }

        // Transform each document into a vector of TF*IDF values.
        return TransformToTFIDFVectors(stemmedDocs, _vocabularyIDF);
    }

    /// <summary>
    /// Converts stemmed documents (lists of stemmed words) and their vocabulary/IDF values
    /// into an array of TF*IDF vectors. Vector components follow the enumeration order of
    /// <paramref name="vocabularyIDF"/>, so every document yields a vector of the same length.
    /// </summary>
    /// <param name="stemmedDocs">List of stemmed documents (each a list of stemmed words).</param>
    /// <param name="vocabularyIDF">Dictionary of term to IDF value.</param>
    /// <returns>One TF*IDF vector per document.</returns>
    private static double[][] TransformToTFIDFVectors(List<List<string>> stemmedDocs, Dictionary<string, double> vocabularyIDF)
    {
        var vectors = new List<double[]>(stemmedDocs.Count);

        foreach (var doc in stemmedDocs)
        {
            // Presize: one component per vocabulary term.
            var vector = new List<double>(vocabularyIDF.Count);

            foreach (var vocab in vocabularyIDF)
            {
                // Term frequency = how many times the term appears in this document.
                double tf = doc.Count(d => d == vocab.Key);
                vector.Add(tf * vocab.Value);
            }

            vectors.Add(vector.ToArray());
        }

        return vectors.ToArray();
    }

    /// <summary>
    /// Normalizes an array of TF*IDF vectors using the L2-norm:
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// </summary>
    /// <param name="vectors">Vectors to normalize.</param>
    /// <returns>Normalized vectors, in the same order.</returns>
    public static double[][] Normalize(double[][] vectors)
    {
        return vectors.Select(v => Normalize(v)).ToArray();
    }

    /// <summary>
    /// Normalizes a single TF*IDF vector using the L2-norm:
    /// Xi = Xi / Sqrt(X0^2 + X1^2 + .. + Xn^2)
    /// </summary>
    /// <param name="vector">Vector to normalize.</param>
    /// <returns>The normalized vector.</returns>
    public static double[] Normalize(double[] vector)
    {
        double sumSquared = 0;
        foreach (var value in vector)
        {
            sumSquared += value * value;
        }

        double norm = Math.Sqrt(sumSquared);

        // Guard: an all-zero vector has no direction. Return a zero copy instead of
        // dividing by zero, which previously produced a vector of NaNs.
        if (norm == 0)
        {
            return new double[vector.Length];
        }

        return vector.Select(value => value / norm).ToArray();
    }

    /// <summary>
    /// Saves the TFIDF vocabulary to disk as one "term&lt;TAB&gt;idf" line per entry.
    /// NOTE: earlier versions used BinaryFormatter, which is insecure (arbitrary code
    /// execution on deserialization) and removed in .NET 9; files written by those
    /// versions cannot be read by <see cref="Load"/> anymore.
    /// </summary>
    /// <param name="filePath">File path.</param>
    public static void Save(string filePath = "vocabulary.dat")
    {
        using (var writer = new StreamWriter(filePath, false, Encoding.UTF8))
        {
            foreach (var entry in _vocabularyIDF)
            {
                writer.Write(entry.Key);
                writer.Write('\t');
                // "R" round-trips the double exactly; invariant culture keeps the
                // file format independent of the machine's locale.
                writer.WriteLine(entry.Value.ToString("R", System.Globalization.CultureInfo.InvariantCulture));
            }
        }
    }

    /// <summary>
    /// Loads the TFIDF vocabulary from disk (the "term&lt;TAB&gt;idf" format written by <see cref="Save"/>).
    /// </summary>
    /// <param name="filePath">File path.</param>
    public static void Load(string filePath = "vocabulary.dat")
    {
        var vocabulary = new Dictionary<string, double>();

        foreach (var line in File.ReadLines(filePath))
        {
            int tab = line.IndexOf('\t');
            if (tab < 0)
            {
                // Skip malformed lines rather than failing the whole load.
                continue;
            }

            string term = line.Substring(0, tab);
            double idf = double.Parse(line.Substring(tab + 1), System.Globalization.CultureInfo.InvariantCulture);
            vocabulary[term] = idf;
        }

        _vocabularyIDF = vocabulary;
    }

    #region Private Helpers

    /// <summary>
    /// Parses, tokenizes and stems a list of documents, returning the vocabulary of terms
    /// that occur at least <paramref name="vocabularyThreshold"/> times across all documents.
    /// </summary>
    /// <param name="docs">Documents to process.</param>
    /// <param name="stemmedDocs">Receives one list of stemmed words per document.</param>
    /// <param name="vocabularyThreshold">Minimum total occurrences for a term to be kept.</param>
    /// <returns>Vocabulary (list of stemmed terms).</returns>
    private static List<string> GetVocabulary(string[] docs, out List<List<string>> stemmedDocs, int vocabularyThreshold)
    {
        var wordCountList = new Dictionary<string, int>();
        stemmedDocs = new List<List<string>>();

        int docIndex = 0;
        foreach (var doc in docs)
        {
            var stemmedDoc = new List<string>();

            docIndex++;
            if (docIndex % 100 == 0)
            {
                Console.WriteLine("Processing " + docIndex + "/" + docs.Length);
            }

            foreach (string part in Tokenize(doc))
            {
                // Strip non-alphanumeric characters.
                string stripped = Regex.Replace(part, "[^a-zA-Z0-9]", "");

                // Skip stop words.
                if (StopWords.stopWordsList.Contains(stripped.ToLower()))
                {
                    continue;
                }

                try
                {
                    string stem = new EnglishWord(stripped).Stem;
                    if (stem.Length > 0)
                    {
                        // Build the word count list.
                        // BUG FIX: the first occurrence was previously recorded as 0,
                        // making the threshold filter below off by one.
                        if (wordCountList.ContainsKey(stem))
                        {
                            wordCountList[stem]++;
                        }
                        else
                        {
                            wordCountList.Add(stem, 1);
                        }

                        stemmedDoc.Add(stem);
                    }
                }
                catch
                {
                    // Deliberate best-effort: skip any token the stemmer cannot handle.
                }
            }

            stemmedDocs.Add(stemmedDoc);
        }

        // Keep only the terms that occur at least vocabularyThreshold times overall.
        return wordCountList
            .Where(w => w.Value >= vocabularyThreshold)
            .Select(w => w.Key)
            .ToList();
    }

    /// <summary>
    /// Tokenizes a string into words, first replacing HTML tags, numbers, URLs, e-mail
    /// addresses, dollar signs and @usernames with canonical placeholder tokens.
    /// Note: the regex order matters — e-mail addresses must be replaced before the
    /// @username pattern, or the address would be half-consumed.
    /// </summary>
    /// <param name="text">Text to tokenize.</param>
    /// <returns>Array of word tokens (may contain empty strings between delimiters).</returns>
    private static string[] Tokenize(string text)
    {
        // Strip all HTML.
        text = Regex.Replace(text, "<[^<>]+>", "");
        // Strip numbers.
        text = Regex.Replace(text, "[0-9]+", "number");
        // Strip urls.
        text = Regex.Replace(text, @"(http|https)://[^\s]*", "httpaddr");
        // Strip email addresses.
        text = Regex.Replace(text, @"[^\s]+@[^\s]+", "emailaddr");
        // Strip dollar sign.
        text = Regex.Replace(text, "[$]+", "dollar");
        // Strip usernames.
        text = Regex.Replace(text, @"@[^\s]+", "username");

        // Tokenize and also get rid of any punctuation.
        return text.Split(" @$/#.-:&*+=[]?!(){},''\">_<;%\\".ToCharArray());
    }

    #endregion
}
}