-
Notifications
You must be signed in to change notification settings - Fork 24
Expand file tree
/
Copy pathmain.go
More file actions
323 lines (255 loc) · 9.79 KB
/
main.go
File metadata and controls
323 lines (255 loc) · 9.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
// Gedcomdiff is a tool for comparing GEDCOM files and producing an HTML report.
//
// Usage
//
// gedcomdiff -left-gedcom file1.ged -right-gedcom file2.ged -output out.html
//
// For a complete list of options use:
//
// gedcomdiff -help
//
package main
import (
"flag"
"fmt"
"github.com/cheggaaa/pb"
"github.com/elliotchance/gedcom"
"github.com/elliotchance/gedcom/html"
"github.com/elliotchance/gedcom/util"
"log"
"os"
"os/signal"
"time"
)
// Command-line options. Every variable below is bound to a flag in
// parseCLIFlags and checked (where applicable) in validateOptions.
var (
	// Paths: -left-gedcom, -right-gedcom and -output.
	optionLeftGedcomFile, optionRightGedcomFile, optionOutputFile string

	// Report presentation.
	optionShow              string // -show; see optionShow constants.
	optionSort              string // -sort; see optionSort constants.
	optionGoogleAnalyticsID string // -google-analytics-id

	// Execution: -progress and -jobs.
	optionProgress bool
	optionJobs     int

	// Matching thresholds: -minimum-similarity,
	// -minimum-weighted-similarity and -prefer-pointer-above.
	optionMinimumSimilarity         float64
	optionMinimumWeightedSimilarity float64
	optionPreferPointerAbove        float64

	// Decoder leniency: -allow-multi-line and -allow-invalid-indents.
	optionAllowMultiLine, optionAllowInvalidIndents bool
)
// filterFlags is the shared gedcom.FilterFlags value. parseCLIFlags calls its
// SetupCLI method before flag.Parse, and main passes it to html.NewDiffPage.
var filterFlags = &gedcom.FilterFlags{}
// check terminates the program through log.Fatal when err is non-nil. A nil
// err is a no-op.
func check(err error) {
	if err == nil {
		return
	}

	log.Fatal(err)
}
// newDocumentFromGEDCOMFile opens the file at path and decodes it with a
// gedcom decoder.
//
// The decoder's AllowMultiLine and AllowInvalidIndents settings are taken from
// the -allow-multi-line and -allow-invalid-indents options. The error is
// non-nil if the file cannot be opened or if decoding fails.
func newDocumentFromGEDCOMFile(path string) (*gedcom.Document, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}

	// Release the file descriptor once decoding has returned. The file is only
	// read, so a failure to close it cannot lose data and is deliberately
	// ignored.
	defer file.Close()

	decoder := gedcom.NewDecoder(file)
	decoder.AllowMultiLine = optionAllowMultiLine
	decoder.AllowInvalidIndents = optionAllowInvalidIndents

	return decoder.Decode()
}
// main is the entry point of gedcomdiff. It parses the command-line flags,
// decodes the left and right GEDCOM files, compares their individuals and
// writes the HTML diff page to the -output file. When -progress is set a
// progress bar is shown for the comparison and another for the rendering.
func main() {
	parseCLIFlags()

	similarityOptions := gedcom.NewSimilarityOptions()
	similarityOptions.MinimumWeightedSimilarity = optionMinimumWeightedSimilarity
	similarityOptions.PreferPointerAbove = optionPreferPointerAbove
	similarityOptions.MinimumSimilarity = optionMinimumSimilarity

	compareOptions := gedcom.NewIndividualNodesCompareOptions()
	compareOptions.SimilarityOptions = similarityOptions
	compareOptions.Notifier = make(chan gedcom.Progress)
	compareOptions.NotifierStep = 100
	compareOptions.Jobs = optionJobs

	// Gracefully handle a ctrl+c.
	signaller := make(chan os.Signal, 1)
	signal.Notify(signaller, os.Interrupt)
	go func() {
		<-signaller

		defer func() {
			// The panic may occur if ctrl+c happens after the comparisons are
			// finished but it's still rendering the output.
			//
			// We tried our best, exit with a failure code.
			recover()
			log.Fatal("aborted")
		}()

		// This is the correct way to abort the comparisons.
		close(compareOptions.Notifier)
	}()

	leftGedcom, err := newDocumentFromGEDCOMFile(optionLeftGedcomFile)
	check(err)

	rightGedcom, err := newDocumentFromGEDCOMFile(optionRightGedcomFile)
	check(err)

	// Run compare.
	leftIndividuals := leftGedcom.Individuals()
	rightIndividuals := rightGedcom.Individuals()

	// The output file is created before the comparison starts (presumably so
	// that an unusable -output path fails before the long-running work).
	out, err := os.Create(optionOutputFile)
	check(err)

	// The comparison runs in its own goroutine so this goroutine can consume
	// the progress notifications. compareDone is closed only after the result
	// has been stored, which makes the later read of comparisons safe.
	var comparisons gedcom.IndividualComparisons
	compareDone := make(chan struct{})
	go func() {
		comparisons = leftIndividuals.Compare(rightIndividuals, compareOptions)
		close(compareDone)
	}()

	if optionProgress {
		progressBar := pb.StartNew(0).Prefix("Comparing Documents")
		progressBar.SetRefreshRate(500 * time.Millisecond)
		progressBar.ShowElapsedTime = true
		progressBar.ShowTimeLeft = true

		for n := range compareOptions.Notifier {
			progressBar.SetTotal64(n.Total)
			progressBar.Set64(n.Done)
		}

		progressBar.Finish()
	} else {
		// Wait for notifier channel to be closed.
		for range compareOptions.Notifier {
		}
	}

	// The notifier being closed does not guarantee that the goroutine above
	// has already assigned comparisons; wait for it explicitly. If the close
	// came from the ctrl+c handler, that handler exits the process.
	<-compareDone

	diffProgress := make(chan gedcom.Progress)
	page := html.NewDiffPage(comparisons, filterFlags, optionGoogleAnalyticsID,
		optionShow, optionSort, diffProgress, compareOptions, html.LivingVisibilityShow)

	go func() {
		// A goroutine-local err avoids writing to main's err from another
		// goroutine.
		if _, err := page.WriteHTMLTo(out); err != nil {
			log.Fatal(err)
		}

		// Closing diffProgress ends the draining loop below.
		close(diffProgress)
	}()

	if optionProgress {
		progressBar := pb.StartNew(0).Prefix("Comparing Individuals")
		progressBar.SetRefreshRate(500 * time.Millisecond)
		progressBar.ShowElapsedTime = true
		progressBar.ShowTimeLeft = true

		for p := range diffProgress {
			// A zero Total is skipped so it does not overwrite the total
			// that was set previously.
			if p.Total != 0 {
				progressBar.SetTotal64(p.Total)
			}

			progressBar.Add64(p.Add)
		}

		progressBar.Finish()
	} else {
		for range diffProgress {
		}
	}

	// diffProgress is closed only after WriteHTMLTo has returned, so the page
	// is fully written here. Close the file and report any error from doing
	// so rather than silently dropping it.
	check(out.Close())
}
// parseCLIFlags registers every command-line flag on the default flag set,
// binding each one to its package-level option variable, then registers the
// filter flags through filterFlags.SetupCLI, parses the command line with
// flag.Parse and finally calls validateOptions.
func parseCLIFlags() {
// Input files. Must be provided.
flag.StringVar(&optionLeftGedcomFile, "left-gedcom", "",
"Required. Left GEDCOM file.")
flag.StringVar(&optionRightGedcomFile, "right-gedcom", "",
"Required. Right GEDCOM file.")
// NOTE(review): validateOptions also rejects an empty -output, but unlike the
// two flags above this usage text does not say "Required.".
flag.StringVar(&optionOutputFile, "output", "", "Output file.")
// -show defaults to html.DiffPageShowAll; validateOptions restricts it to the
// DiffPageShow* values.
flag.StringVar(&optionShow, "show", html.DiffPageShowAll, util.CLIDescription(`
The "-show" option controls which individuals are shown in the output:
"all": Default. Show all individuals from both files.
"only-matches": Only show individuals that match in both files. You can
control the threshold with the "-minimum-weighted-similarity" and
"-minimum-similarity" options. This is useful when comparing trees that
are unlikely to have many matches.
"subset": The right side will be considered a smaller part of the larger
left side. This means that individuals that entirely exist on the left
side will not be shown. This is useful when comparing a smaller part of
a tree with a larger tree.`))
flag.StringVar(&optionGoogleAnalyticsID, "google-analytics-id", "",
"The Google Analytics ID, like 'UA-78454410-2'.")
flag.BoolVar(&optionProgress, "progress", false, "Show progress bar.")
flag.IntVar(&optionJobs, "jobs", 1, util.CLIDescription(`Number of jobs to run in
parallel. If you are comparing large trees this will make the process
faster but will consume more CPU.`))
// Both similarity thresholds below default to gedcom.DefaultMinimumSimilarity.
// NOTE(review): the help text below reads "should be the seen as" - likely a
// typo; the string is left unchanged here.
flag.Float64Var(&optionMinimumWeightedSimilarity,
"minimum-weighted-similarity", gedcom.DefaultMinimumSimilarity,
util.CLIDescription(`The weighted minimum similarity is the threshold
for whether two individuals should be the seen as the same person
when the surrounding immediate family is taken into consideration.
This value must be between 0 and 1 and is the primary way to adjust
the sensitivity of matches. It is best to also set
"-minimum-similarity" to the same value.
A higher value means you will get less matches but they will be of
higher quality. If you are comparing trees that do not share many of
the same individuals you should consider raising this to prevent
false-positives.`))
flag.Float64Var(&optionMinimumSimilarity,
"minimum-similarity", gedcom.DefaultMinimumSimilarity,
util.CLIDescription(`The minimum similarity is the threshold for
matching individuals as the same person. This is used to compare
only the individual (not surrounding family) like spouses and
children.
This value must be between 0 and 1 and should be set to the same
value as "minimum-weighted-similarity" if you are unsure.`))
// -sort defaults to html.DiffPageSortWrittenName; validateOptions restricts it
// to the DiffPageSort* values.
// NOTE(review): the help text below reads "by written their written name" -
// likely a typo; the string is left unchanged here.
flag.StringVar(&optionSort, "sort", html.DiffPageSortWrittenName,
util.CLIDescription(`
Controls how the individuals are sorted in the output:
"written-name": Sort individuals by written their written name.
"highest-similarity": Sort the individuals by their match
similarity. Highest matches will appear first.`))
// The default value is interpolated into the help text with fmt.Sprintf so the
// description always shows gedcom.DefaultMinimumSimilarity.
flag.Float64Var(&optionPreferPointerAbove, "prefer-pointer-above",
gedcom.DefaultMinimumSimilarity, util.CLIDescription(fmt.Sprintf(`
Controls if two individuals should be considered a match by their
pointer value.
The default value is %f which means that the individuals will be
considered a match if they share the same pointer and hit the same
default minimum similarity.
A value of 1.0 would have to be a perfect match to be considered
equal on their pointer, this is the same as disabling the feature.
A value of 0.0 would mean that it always trusts the pointer match,
even if the individuals are nothing alike.
This option makes sense when you are comparing documents that have
come from the same base and retained the pointers between
individuals of the existing data.
`, gedcom.DefaultMinimumSimilarity)))
// The two decoder options below are copied onto the decoder in
// newDocumentFromGEDCOMFile.
flag.BoolVar(&optionAllowMultiLine, "allow-multi-line", false,
util.CLIDescription(`
It is not valid for GEDCOM values to contain new lines or carriage
returns. However, some application dump data without correctly using
the CONT tags.
Strictly speaking we should bail out with an error but there are too
many cases that are difficult to clean up for consumers so we offer
and option to permit it.
When enabled any line than cannot be parsed will be considered an
extension of the previous line (including the new line character).
`))
flag.BoolVar(&optionAllowInvalidIndents, "allow-invalid-indents", false,
util.CLIDescription(`
When enabled, -allow-invalid-indents allows a child node to have an
indent greater than +1 of the parent. -allow-invalid-indents is
disabled by default because if this happens the GEDCOM file is
broken in some possibly serious way and certainly not a valid GEDCOM
file.
The biggest problem with having the indents wrongly aligned is that
nodes that are expected to be a certain depth (such as NPFX inside a
NAME) will probably break or interfere with a traversal algorithm
that is not expecting the node to be there/at that level. This may
lead to unexpected behavior.
`))
// The filter flags are registered before flag.Parse so they are parsed
// together with the flags above.
filterFlags.SetupCLI()
flag.Parse()
validateOptions()
}
// validateOptions verifies the parsed command-line options and terminates the
// program through log.Fatalf on the first problem found: a missing
// -left-gedcom, -right-gedcom or -output value, or a -show / -sort value that
// is not one of the accepted html.DiffPage* constants.
func validateOptions() {
	// Required flags, checked in order. log.Fatalf exits, so only the first
	// missing flag is ever reported.
	switch {
	case optionLeftGedcomFile == "":
		log.Fatalf(`-left-gedcom is required`)
	case optionRightGedcomFile == "":
		log.Fatalf(`-right-gedcom is required`)
	case optionOutputFile == "":
		log.Fatalf(`-output is required`)
	}

	// -show must be one of the accepted values.
	allowedShow := gedcom.NewStringSet(
		html.DiffPageShowAll,
		html.DiffPageShowSubset,
		html.DiffPageShowOnlyMatches,
	)
	if knownShow := allowedShow.Has(optionShow); !knownShow {
		log.Fatalf(`invalid "-show" value: %s`, optionShow)
	}

	// -sort must be one of the accepted values.
	allowedSort := gedcom.NewStringSet(
		html.DiffPageSortWrittenName,
		html.DiffPageSortHighestSimilarity,
	)
	if knownSort := allowedSort.Has(optionSort); !knownSort {
		log.Fatalf(`invalid "-sort" value: %s`, optionSort)
	}
}