-
Notifications
You must be signed in to change notification settings - Fork 215
Expand file tree
/
Copy pathcsv_example.py
More file actions
180 lines (142 loc) · 5.73 KB
/
csv_example.py
File metadata and controls
180 lines (142 loc) · 5.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/python
"""
This code demonstrates how to use dedupe with a comma separated values
(CSV) file. All operations are performed in memory, so it will run very
quickly on datasets up to ~10,000 rows.
We start with a CSV file containing our messy data. In this example,
it is listings of early childhood education centers in Chicago
compiled from several different sources.
The output will be a CSV with our clustered results.
For larger datasets, see our [mysql_example](mysql_example.html)
"""
import csv
import logging
import optparse
import os
import re
import dedupe
from unidecode import unidecode
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.

    Args:
        column: Raw field value read from the CSV, or ``None`` when the
            field is absent from the row.

    Returns:
        The cleaned, lower-cased string, or ``None`` if the value is
        missing or empty after cleaning.
    """
    # csv.DictReader pads short rows with None; treat that as missing data
    # rather than passing None on to unidecode.
    if column is None:
        return None

    column = unidecode(column)
    # Turn newlines into spaces *before* collapsing runs of spaces, so that
    # input such as "a \n b" ends up with a single space between the words.
    column = re.sub("\n", " ", column)
    column = re.sub(" +", " ", column)
    # Trim surrounding whitespace and quote characters, then normalise case.
    column = column.strip().strip('"').strip("'").lower().strip()
    # If data is missing, indicate that by setting the value to `None`
    if not column:
        column = None
    return column
def readData(filename):
    """
    Load the CSV file at `filename` into a dictionary of cleaned records.

    Each key is the integer value of the row's "Id" column and each value
    is a dict mapping column names to the result of `preProcess` on the
    corresponding field.
    """
    records = {}
    with open(filename) as csv_file:
        for raw_row in csv.DictReader(csv_file):
            # Clean every field first, then key the record by its numeric Id.
            cleaned = {
                field: preProcess(value) for field, value in raw_row.items()
            }
            record_id = int(raw_row["Id"])
            records[record_id] = cleaned
    return records
if __name__ == "__main__":
    # ## Logging

    # Dedupe uses Python logging to show or suppress verbose output. This
    # code block lets you change the level of logging on the command
    # line. You don't need it if you don't want that. To enable verbose
    # logging, run `python examples/csv_example/csv_example.py -v`
    optp = optparse.OptionParser()
    optp.add_option(
        "-v",
        "--verbose",
        dest="verbose",
        action="count",
        help="Increase verbosity (specify multiple times for more)",
    )
    (opts, args) = optp.parse_args()

    # No -v: warnings only; -v: info; -vv (or more): debug.
    log_level = logging.WARNING
    if opts.verbose:
        if opts.verbose == 1:
            log_level = logging.INFO
        elif opts.verbose >= 2:
            log_level = logging.DEBUG
    logging.basicConfig(level=log_level)

    # ## Setup

    input_file = "csv_example_messy_input.csv"
    output_file = "csv_example_output.csv"
    settings_file = "csv_example_learned_settings"
    training_file = "csv_example_training.json"

    print("importing data ...")
    data_d = readData(input_file)

    # If a settings file already exists, we'll just load that and skip training
    if os.path.exists(settings_file):
        print("reading from", settings_file)
        with open(settings_file, "rb") as f:
            deduper = dedupe.StaticDedupe(f)
    else:
        # ## Training

        # Define the fields dedupe will pay attention to
        fields = [
            dedupe.variables.String("Site name"),
            dedupe.variables.String("Address"),
            dedupe.variables.Exact("Zip", has_missing=True),
            dedupe.variables.String("Phone", has_missing=True),
        ]

        # Create a new deduper object and pass our data model to it.
        deduper = dedupe.Dedupe(fields)

        # If we have training data saved from a previous run of dedupe,
        # look for it and load it in.
        # __Note:__ if you want to train from scratch, delete the training_file
        if os.path.exists(training_file):
            print("reading labeled examples from ", training_file)
            with open(training_file, "rb") as f:
                deduper.prepare_training(data_d, f)
        else:
            deduper.prepare_training(data_d)

        # ## Active learning
        # Dedupe will find the next pair of records
        # it is least certain about and ask you to label them as duplicates
        # or not.
        # use 'y', 'n' and 'u' keys to flag duplicates
        # press 'f' when you are finished
        print("starting active labeling...")

        dedupe.console_label(deduper)

        # Using the examples we just labeled, train the deduper and learn
        # blocking predicates
        deduper.train()

        # When finished, save our training to disk
        with open(training_file, "w") as tf:
            deduper.write_training(tf)

        # Save our weights and predicates to disk. If the settings file
        # exists, we will skip all the training and learning next time we run
        # this file.
        with open(settings_file, "wb") as sf:
            deduper.write_settings(sf)

    # ## Clustering

    # `partition` will return sets of records that dedupe
    # believes are all referring to the same entity.

    print("clustering...")
    clustered_dupes = deduper.partition(data_d, 0.5)

    print("# duplicate sets", len(clustered_dupes))

    # ## Writing Results

    # Write our original data back out to a CSV with a new column called
    # 'Cluster ID' which indicates which records refer to each other.

    cluster_membership = {}
    for cluster_id, (records, scores) in enumerate(clustered_dupes):
        for record_id, score in zip(records, scores):
            cluster_membership[record_id] = {
                "Cluster ID": cluster_id,
                "confidence_score": score,
            }

    # Open both CSV files with newline="" as the csv module documentation
    # requires: otherwise the writer's "\r\n" terminator is translated again
    # on Windows (producing blank rows) and newlines embedded in quoted
    # fields are not round-tripped faithfully.
    with open(output_file, "w", newline="") as f_output, open(
        input_file, newline=""
    ) as f_input:
        reader = csv.DictReader(f_input)
        fieldnames = ["Cluster ID", "confidence_score"] + reader.fieldnames

        writer = csv.DictWriter(f_output, fieldnames=fieldnames)
        writer.writeheader()

        for row in reader:
            row_id = int(row["Id"])
            # Prepend the cluster assignment and score to the original row.
            row.update(cluster_membership[row_id])
            writer.writerow(row)