-
-
Notifications
You must be signed in to change notification settings - Fork 27k
Expand file tree
/
Copy path_tree.pyx
More file actions
1998 lines (1620 loc) · 72.6 KB
/
_tree.pyx
File metadata and controls
1998 lines (1620 loc) · 72.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cpython cimport Py_INCREF, PyObject, PyTypeObject
from libc.stdlib cimport free
from libc.string cimport memcpy
from libc.string cimport memset
from libc.stdint cimport INTPTR_MAX
from libc.math cimport isnan
from libcpp.vector cimport vector
from libcpp.algorithm cimport pop_heap
from libcpp.algorithm cimport push_heap
from libcpp.stack cimport stack
from libcpp cimport bool
import struct
import numpy as np
cimport numpy as cnp
cnp.import_array()
from scipy.sparse import issparse
from scipy.sparse import csr_array
from sklearn.utils import _align_api_if_sparse
from sklearn.tree._utils cimport safe_realloc
from sklearn.tree._utils cimport sizet_ptr_to_ndarray
cdef extern from "numpy/arrayobject.h":
object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr,
int nd, cnp.npy_intp* dims,
cnp.npy_intp* strides,
void* data, int flags, object obj)
int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj)
# =============================================================================
# Types and constants
# =============================================================================
# Input dtype expected by the splitters / apply methods (single precision);
# DOUBLE is the dtype of y, sample_weight and the stored node values.
from numpy import float32 as DTYPE
from numpy import float64 as DOUBLE

cdef float64_t INFINITY = np.inf
# float64 machine epsilon, used as a tolerance for impurity comparisons.
cdef float64_t EPSILON = np.finfo('double').eps

# Some handy constants (BestFirstTreeBuilder)
cdef bint IS_FIRST = 1
cdef bint IS_NOT_FIRST = 0
cdef bint IS_LEFT = 1
cdef bint IS_NOT_LEFT = 0

# Sentinel ids: TREE_LEAF marks the children fields of a leaf node,
# TREE_UNDEFINED marks fields that do not apply (e.g. the root's parent).
TREE_LEAF = -1
TREE_UNDEFINED = -2
# C-typed copies usable inside nogil sections.
cdef intp_t _TREE_LEAF = TREE_LEAF
cdef intp_t _TREE_UNDEFINED = TREE_UNDEFINED

# Build the corresponding numpy dtype for Node.
# This works by casting `dummy` to an array of Node of length 1, which numpy
# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946
# for a more detailed explanation.
cdef Node dummy
NODE_DTYPE = np.asarray(<Node[:1]>(&dummy)).dtype
cdef inline void _init_parent_record(ParentInfo* record) noexcept nogil:
    """Reset a ParentInfo struct to its pre-build defaults.

    Impurity starts at +inf (unknown) and the monotonicity bounds start
    fully open, i.e. (-inf, +inf); no constant features seen yet.
    """
    record.impurity = INFINITY
    record.lower_bound = -INFINITY
    record.upper_bound = INFINITY
    record.n_constant_features = 0
# =============================================================================
# TreeBuilder
# =============================================================================
cdef class TreeBuilder:
    """Interface for different tree building strategies."""

    cpdef build(
        self,
        Tree tree,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight=None,
        const uint8_t[::1] missing_values_in_feature_mask=None,
    ):
        """Build a decision tree from the training set (X, y).

        Abstract method: concrete builders (depth-first, best-first)
        override this and populate ``tree`` in place.
        """
        pass

    cdef inline _check_input(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
    ):
        """Check input dtype, layout and format"""
        if issparse(X):
            # Splitters require CSC layout with sorted indices for sparse X.
            X = X.tocsc()
            X.sort_indices()

            if X.data.dtype != DTYPE:
                X.data = np.ascontiguousarray(X.data, dtype=DTYPE)

            if X.indices.dtype != np.int32 or X.indptr.dtype != np.int32:
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

        elif X.dtype != DTYPE:
            # since we have to copy we will make it fortran for efficiency
            X = np.asfortranarray(X, dtype=DTYPE)

        # NOTE(review): only contiguity is checked here; sample_weight is
        # presumably already float64 when it reaches this point — confirm
        # against the callers' validation.
        if sample_weight is not None and not sample_weight.base.flags.contiguous:
            sample_weight = np.asarray(sample_weight, dtype=DOUBLE, order="C")

        return X, y, sample_weight
# Depth first builder ---------------------------------------------------------
# A record on the stack for depth-first tree growing
# A record on the stack for depth-first tree growing
cdef struct StackRecord:
    intp_t start                 # first sample index of this node's partition
    intp_t end                   # one past the last sample index
    intp_t depth                 # depth of the node in the tree
    intp_t parent                # parent node id (_TREE_UNDEFINED for the root)
    bint is_left                 # whether this node is its parent's left child
    float64_t impurity           # node impurity, as computed by the parent's split
    intp_t n_constant_features   # constant-feature count carried over from the parent
    float64_t lower_bound        # monotonicity lower bound for the node's value
    float64_t upper_bound        # monotonicity upper bound for the node's value
cdef class DepthFirstTreeBuilder(TreeBuilder):
    """Build a decision tree in depth-first fashion."""

    def __cinit__(self, Splitter splitter, intp_t min_samples_split,
                  intp_t min_samples_leaf, float64_t min_weight_leaf,
                  intp_t max_depth, float64_t min_impurity_decrease):
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.max_depth = max_depth
        self.min_impurity_decrease = min_impurity_decrease

    cpdef build(
        self,
        Tree tree,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight=None,
        const uint8_t[::1] missing_values_in_feature_mask=None,
    ):
        """Build a decision tree from the training set (X, y).

        Grows the tree iteratively with an explicit LIFO stack instead of
        recursion; the whole growing loop runs without the GIL.
        """

        # check input
        X, y, sample_weight = self._check_input(X, y, sample_weight)

        # Initial capacity
        cdef intp_t init_capacity

        if tree.max_depth <= 10:
            # A complete binary tree of depth d has 2**(d+1) - 1 nodes.
            init_capacity = <intp_t> (2 ** (tree.max_depth + 1)) - 1
        else:
            # Deep trees: start with a fixed capacity and grow on demand.
            init_capacity = 2047

        tree._resize(init_capacity)

        # Parameters (hoisted to C locals for nogil access)
        cdef Splitter splitter = self.splitter
        cdef intp_t max_depth = self.max_depth
        cdef intp_t min_samples_leaf = self.min_samples_leaf
        cdef float64_t min_weight_leaf = self.min_weight_leaf
        cdef intp_t min_samples_split = self.min_samples_split
        cdef float64_t min_impurity_decrease = self.min_impurity_decrease

        # Recursive partition (without actual recursion)
        splitter.init(X, y, sample_weight, missing_values_in_feature_mask)

        cdef intp_t start
        cdef intp_t end
        cdef intp_t depth
        cdef intp_t parent
        cdef bint is_left
        cdef intp_t n_node_samples = splitter.n_samples
        cdef float64_t weighted_n_node_samples
        cdef SplitRecord split
        cdef intp_t node_id

        cdef float64_t middle_value
        cdef float64_t left_child_min
        cdef float64_t left_child_max
        cdef float64_t right_child_min
        cdef float64_t right_child_max
        cdef bint is_leaf
        cdef bint first = 1
        cdef intp_t max_depth_seen = -1
        cdef int rc = 0

        cdef stack[StackRecord] builder_stack
        cdef StackRecord stack_record

        cdef ParentInfo parent_record
        _init_parent_record(&parent_record)

        with nogil:
            # push root node onto stack
            builder_stack.push({
                "start": 0,
                "end": n_node_samples,
                "depth": 0,
                "parent": _TREE_UNDEFINED,
                "is_left": 0,
                "impurity": INFINITY,
                "n_constant_features": 0,
                "lower_bound": -INFINITY,
                "upper_bound": INFINITY,
            })

            while not builder_stack.empty():
                stack_record = builder_stack.top()
                builder_stack.pop()

                start = stack_record.start
                end = stack_record.end
                depth = stack_record.depth
                parent = stack_record.parent
                is_left = stack_record.is_left
                parent_record.impurity = stack_record.impurity
                parent_record.n_constant_features = stack_record.n_constant_features
                parent_record.lower_bound = stack_record.lower_bound
                parent_record.upper_bound = stack_record.upper_bound

                n_node_samples = end - start
                splitter.node_reset(start, end, &weighted_n_node_samples)

                # Pre-split stopping criteria: depth limit, sample-count and
                # sample-weight thresholds.
                is_leaf = (depth >= max_depth or
                           n_node_samples < min_samples_split or
                           n_node_samples < 2 * min_samples_leaf or
                           weighted_n_node_samples < 2 * min_weight_leaf)

                if first:
                    # The root impurity is unknown at push time; compute it once.
                    parent_record.impurity = splitter.node_impurity()
                    first = 0

                # impurity == 0 with tolerance due to rounding errors
                is_leaf = is_leaf or parent_record.impurity <= EPSILON

                if not is_leaf:
                    splitter.node_split(
                        &parent_record,
                        &split,
                    )
                    # If EPSILON=0 in the below comparison, float precision
                    # issues stop splitting, producing trees that are
                    # dissimilar to v0.18
                    # split.pos >= end means no valid split was found.
                    is_leaf = (is_leaf or split.pos >= end or
                               (split.improvement + EPSILON <
                                min_impurity_decrease))

                node_id = tree._add_node(parent, is_left, is_leaf, split.feature,
                                         split.threshold, parent_record.impurity,
                                         n_node_samples, weighted_n_node_samples,
                                         split.missing_go_to_left)

                if node_id == INTPTR_MAX:
                    # Out of memory while growing the node array.
                    rc = -1
                    break

                # Store value for all nodes, to facilitate tree/model
                # inspection and interpretation
                splitter.node_value(tree.value + node_id * tree.value_stride)
                if splitter.with_monotonic_cst:
                    splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound)

                if not is_leaf:
                    if (
                        not splitter.with_monotonic_cst or
                        splitter.monotonic_cst[split.feature] == 0
                    ):
                        # Split on a feature with no monotonicity constraint

                        # Current bounds must always be propagated to both children.
                        # If a monotonic constraint is active, bounds are used in
                        # node value clipping.
                        left_child_min = right_child_min = parent_record.lower_bound
                        left_child_max = right_child_max = parent_record.upper_bound

                    elif splitter.monotonic_cst[split.feature] == 1:
                        # Split on a feature with monotonic increase constraint
                        left_child_min = parent_record.lower_bound
                        right_child_max = parent_record.upper_bound

                        # Lower bound for right child and upper bound for left child
                        # are set to the same value.
                        middle_value = splitter.criterion.middle_value()
                        right_child_min = middle_value
                        left_child_max = middle_value

                    else:  # i.e. splitter.monotonic_cst[split.feature] == -1
                        # Split on a feature with monotonic decrease constraint
                        right_child_min = parent_record.lower_bound
                        left_child_max = parent_record.upper_bound

                        # Lower bound for left child and upper bound for right child
                        # are set to the same value.
                        middle_value = splitter.criterion.middle_value()
                        left_child_min = middle_value
                        right_child_max = middle_value

                    # Push right child on stack first so the left child is
                    # popped (and grown) next — a depth-first, left-to-right
                    # traversal.
                    builder_stack.push({
                        "start": split.pos,
                        "end": end,
                        "depth": depth + 1,
                        "parent": node_id,
                        "is_left": 0,
                        "impurity": split.impurity_right,
                        "n_constant_features": parent_record.n_constant_features,
                        "lower_bound": right_child_min,
                        "upper_bound": right_child_max,
                    })

                    # Push left child on stack
                    builder_stack.push({
                        "start": start,
                        "end": split.pos,
                        "depth": depth + 1,
                        "parent": node_id,
                        "is_left": 1,
                        "impurity": split.impurity_left,
                        "n_constant_features": parent_record.n_constant_features,
                        "lower_bound": left_child_min,
                        "upper_bound": left_child_max,
                    })

                if depth > max_depth_seen:
                    max_depth_seen = depth

            if rc >= 0:
                # Shrink the over-allocated arrays down to the node count.
                rc = tree._resize_c(tree.node_count)

            if rc >= 0:
                tree.max_depth = max_depth_seen
        if rc == -1:
            raise MemoryError()
# Best first builder ----------------------------------------------------------
cdef struct FrontierRecord:
    # Record of information of a Node, the frontier for a split. Those records are
    # maintained in a heap to access the Node with the best improvement in impurity,
    # allowing growing trees greedily on this improvement.
    intp_t node_id               # id of the node in the tree's node array
    intp_t start                 # first sample index of the node's partition
    intp_t end                   # one past the last sample index
    intp_t pos                   # split position (== end when the node is a leaf)
    intp_t depth                 # depth of the node in the tree
    bint is_leaf                 # whether the node cannot be expanded further
    float64_t impurity           # impurity of the node itself
    float64_t impurity_left      # impurity of the prospective left child
    float64_t impurity_right     # impurity of the prospective right child
    float64_t improvement        # impurity improvement of the best split (heap key)
    float64_t lower_bound        # monotonicity lower bound for the node's value
    float64_t upper_bound        # monotonicity upper bound for the node's value
    float64_t middle_value       # criterion middle value, used as the children's bound
cdef inline bool _compare_records(
    const FrontierRecord& left,
    const FrontierRecord& right,
):
    # Heap comparator: ordering records by `improvement` makes the C++
    # push_heap/pop_heap pair behave as a max-heap on impurity improvement,
    # so the best candidate split is always popped first.
    return left.improvement < right.improvement
cdef inline void _add_to_frontier(
    FrontierRecord rec,
    vector[FrontierRecord]& frontier,
) noexcept nogil:
    """Adds record `rec` to the priority queue `frontier`."""
    # The vector is kept in heap order via push_heap/pop_heap.
    frontier.push_back(rec)
    push_heap(frontier.begin(), frontier.end(), &_compare_records)
cdef class BestFirstTreeBuilder(TreeBuilder):
    """Build a decision tree in best-first fashion.

    The best node to expand is given by the node at the frontier that has the
    highest impurity improvement.
    """
    # Maximum number of leaves the finished tree may have; bounds the number
    # of split nodes via max_split_nodes = max_leaf_nodes - 1.
    cdef intp_t max_leaf_nodes

    def __cinit__(self, Splitter splitter, intp_t min_samples_split,
                  intp_t min_samples_leaf, min_weight_leaf,
                  intp_t max_depth, intp_t max_leaf_nodes,
                  float64_t min_impurity_decrease):
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease

    cpdef build(
        self,
        Tree tree,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight=None,
        const uint8_t[::1] missing_values_in_feature_mask=None,
    ):
        """Build a decision tree from the training set (X, y).

        Repeatedly pops the frontier node with the best impurity
        improvement and splits it, until the frontier is empty or the
        split-node budget is exhausted.
        """

        # check input
        X, y, sample_weight = self._check_input(X, y, sample_weight)

        # Parameters
        cdef Splitter splitter = self.splitter
        cdef intp_t max_leaf_nodes = self.max_leaf_nodes

        # Recursive partition (without actual recursion)
        splitter.init(X, y, sample_weight, missing_values_in_feature_mask)

        cdef vector[FrontierRecord] frontier
        cdef FrontierRecord record
        cdef FrontierRecord split_node_left
        cdef FrontierRecord split_node_right

        cdef float64_t left_child_min
        cdef float64_t left_child_max
        cdef float64_t right_child_min
        cdef float64_t right_child_max

        cdef intp_t n_node_samples = splitter.n_samples
        cdef intp_t max_split_nodes = max_leaf_nodes - 1
        cdef bint is_leaf
        cdef intp_t max_depth_seen = -1
        cdef int rc = 0
        cdef Node* node

        cdef ParentInfo parent_record
        _init_parent_record(&parent_record)

        # Initial capacity: a tree with L leaves has at most L - 1 splits.
        cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes
        tree._resize(init_capacity)

        with nogil:
            # add root to frontier
            rc = self._add_split_node(
                splitter=splitter,
                tree=tree,
                start=0,
                end=n_node_samples,
                is_first=IS_FIRST,
                is_left=IS_LEFT,
                parent=NULL,
                depth=0,
                parent_record=&parent_record,
                res=&split_node_left,
            )
            if rc >= 0:
                _add_to_frontier(split_node_left, frontier)

            while not frontier.empty():
                # Pop the record with the highest impurity improvement.
                pop_heap(frontier.begin(), frontier.end(), &_compare_records)
                record = frontier.back()
                frontier.pop_back()

                node = &tree.nodes[record.node_id]
                is_leaf = (record.is_leaf or max_split_nodes <= 0)

                if is_leaf:
                    # Node is not expandable; set node as leaf
                    node.left_child = _TREE_LEAF
                    node.right_child = _TREE_LEAF
                    node.feature = _TREE_UNDEFINED
                    node.threshold = _TREE_UNDEFINED

                else:
                    # Node is expandable

                    if (
                        not splitter.with_monotonic_cst or
                        splitter.monotonic_cst[node.feature] == 0
                    ):
                        # Split on a feature with no monotonicity constraint

                        # Current bounds must always be propagated to both children.
                        # If a monotonic constraint is active, bounds are used in
                        # node value clipping.
                        left_child_min = right_child_min = record.lower_bound
                        left_child_max = right_child_max = record.upper_bound

                    elif splitter.monotonic_cst[node.feature] == 1:
                        # Split on a feature with monotonic increase constraint
                        left_child_min = record.lower_bound
                        right_child_max = record.upper_bound

                        # Lower bound for right child and upper bound for left child
                        # are set to the same value.
                        right_child_min = record.middle_value
                        left_child_max = record.middle_value

                    else:  # i.e. splitter.monotonic_cst[split.feature] == -1
                        # Split on a feature with monotonic decrease constraint
                        right_child_min = record.lower_bound
                        left_child_max = record.upper_bound

                        # Lower bound for left child and upper bound for right child
                        # are set to the same value.
                        left_child_min = record.middle_value
                        right_child_max = record.middle_value

                    # Decrement number of split nodes available
                    max_split_nodes -= 1

                    # Compute left split node
                    parent_record.lower_bound = left_child_min
                    parent_record.upper_bound = left_child_max
                    parent_record.impurity = record.impurity_left
                    rc = self._add_split_node(
                        splitter=splitter,
                        tree=tree,
                        start=record.start,
                        end=record.pos,
                        is_first=IS_NOT_FIRST,
                        is_left=IS_LEFT,
                        parent=node,
                        depth=record.depth + 1,
                        parent_record=&parent_record,
                        res=&split_node_left,
                    )
                    if rc == -1:
                        break

                    # tree.nodes may have changed
                    # (adding a node can trigger a realloc that moves the
                    # buffer, invalidating the old pointer).
                    node = &tree.nodes[record.node_id]

                    # Compute right split node
                    parent_record.lower_bound = right_child_min
                    parent_record.upper_bound = right_child_max
                    parent_record.impurity = record.impurity_right
                    rc = self._add_split_node(
                        splitter=splitter,
                        tree=tree,
                        start=record.pos,
                        end=record.end,
                        is_first=IS_NOT_FIRST,
                        is_left=IS_NOT_LEFT,
                        parent=node,
                        depth=record.depth + 1,
                        parent_record=&parent_record,
                        res=&split_node_right,
                    )
                    if rc == -1:
                        break

                    # Add nodes to queue
                    _add_to_frontier(split_node_left, frontier)
                    _add_to_frontier(split_node_right, frontier)

                if record.depth > max_depth_seen:
                    max_depth_seen = record.depth

            if rc >= 0:
                # Shrink the over-allocated arrays down to the node count.
                rc = tree._resize_c(tree.node_count)

            if rc >= 0:
                tree.max_depth = max_depth_seen

        if rc == -1:
            raise MemoryError()

    cdef inline int _add_split_node(
        self,
        Splitter splitter,
        Tree tree,
        intp_t start,
        intp_t end,
        bint is_first,
        bint is_left,
        Node* parent,
        intp_t depth,
        ParentInfo* parent_record,
        FrontierRecord* res
    ) except -1 nogil:
        """Adds node w/ partition ``[start, end)`` to the frontier.

        Evaluates the best split for the node, records it in the tree and
        fills ``res`` with the frontier record.  Returns 0 on success,
        -1 on allocation failure.
        """
        cdef SplitRecord split
        cdef intp_t node_id
        cdef intp_t n_node_samples
        cdef float64_t min_impurity_decrease = self.min_impurity_decrease
        cdef float64_t weighted_n_node_samples
        cdef bint is_leaf

        splitter.node_reset(start, end, &weighted_n_node_samples)

        # reset n_constant_features for this specific split before beginning split search
        parent_record.n_constant_features = 0

        if is_first:
            # The root's impurity has not been computed yet.
            parent_record.impurity = splitter.node_impurity()

        n_node_samples = end - start
        # Pre-split stopping criteria: depth, sample counts, weight and purity.
        is_leaf = (depth >= self.max_depth or
                   n_node_samples < self.min_samples_split or
                   n_node_samples < 2 * self.min_samples_leaf or
                   weighted_n_node_samples < 2 * self.min_weight_leaf or
                   parent_record.impurity <= EPSILON  # impurity == 0 with tolerance
                   )

        if not is_leaf:
            splitter.node_split(
                parent_record,
                &split,
            )
            # If EPSILON=0 in the below comparison, float precision issues stop
            # splitting early, producing trees that are dissimilar to v0.18
            # split.pos >= end means no valid split was found.
            is_leaf = (is_leaf or split.pos >= end or
                       split.improvement + EPSILON < min_impurity_decrease)

        # The parent id is recovered from the pointer offset into tree.nodes.
        node_id = tree._add_node(parent - tree.nodes
                                 if parent != NULL
                                 else _TREE_UNDEFINED,
                                 is_left, is_leaf,
                                 split.feature, split.threshold, parent_record.impurity,
                                 n_node_samples, weighted_n_node_samples,
                                 split.missing_go_to_left)
        if node_id == INTPTR_MAX:
            return -1

        # compute values also for split nodes (might become leaves later).
        splitter.node_value(tree.value + node_id * tree.value_stride)
        if splitter.with_monotonic_cst:
            splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound)

        res.node_id = node_id
        res.start = start
        res.end = end
        res.depth = depth
        res.impurity = parent_record.impurity
        res.lower_bound = parent_record.lower_bound
        res.upper_bound = parent_record.upper_bound
        res.middle_value = splitter.criterion.middle_value()

        if not is_leaf:
            # is split node
            res.pos = split.pos
            res.is_leaf = 0
            res.improvement = split.improvement
            res.impurity_left = split.impurity_left
            res.impurity_right = split.impurity_right

        else:
            # is leaf => 0 improvement
            res.pos = end
            res.is_leaf = 1
            res.improvement = 0.0
            res.impurity_left = parent_record.impurity
            res.impurity_right = parent_record.impurity

        return 0
# =============================================================================
# Tree
# =============================================================================
cdef class Tree:
"""Array-based representation of a binary decision tree.
The binary tree is represented as a number of parallel arrays. The i-th
element of each array holds information about the node `i`. Node 0 is the
tree's root. You can find a detailed description of all arrays in
`_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split
nodes, resp. In this case the values of nodes of the other type are
arbitrary!
Attributes
----------
node_count : intp_t
The number of nodes (internal nodes + leaves) in the tree.
capacity : intp_t
The current capacity (i.e., size) of the arrays, which is at least as
great as `node_count`.
max_depth : intp_t
The depth of the tree, i.e. the maximum depth of its leaves.
children_left : array of intp_t, shape [node_count]
children_left[i] holds the node id of the left child of node i.
For leaves, children_left[i] == TREE_LEAF. Otherwise,
children_left[i] > i. This child handles the case where
X[:, feature[i]] <= threshold[i].
children_right : array of intp_t, shape [node_count]
children_right[i] holds the node id of the right child of node i.
For leaves, children_right[i] == TREE_LEAF. Otherwise,
children_right[i] > i. This child handles the case where
X[:, feature[i]] > threshold[i].
n_leaves : intp_t
Number of leaves in the tree.
feature : array of intp_t, shape [node_count]
feature[i] holds the feature to split on, for the internal node i.
threshold : array of float64_t, shape [node_count]
threshold[i] holds the threshold for the internal node i.
value : array of float64_t, shape [node_count, n_outputs, max_n_classes]
Contains the constant prediction value of each node.
impurity : array of float64_t, shape [node_count]
impurity[i] holds the impurity (i.e., the value of the splitting
criterion) at node i.
n_node_samples : array of intp_t, shape [node_count]
n_node_samples[i] holds the number of training samples reaching node i.
weighted_n_node_samples : array of float64_t, shape [node_count]
weighted_n_node_samples[i] holds the weighted number of training samples
reaching node i.
missing_go_to_left : array of bool, shape [node_count]
missing_go_to_left[i] holds a bool indicating whether or not there were
missing values at node i.
"""
    # Wrap for outside world.
    # WARNING: these reference the current `nodes` and `value` buffers, which
    # must not be freed by a subsequent memory allocation.
    # (i.e. through `_resize` or `__setstate__`)
    @property
    def n_classes(self):
        # Per-output class counts, copied out of the C array.
        return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)

    @property
    def children_left(self):
        # Left-child node id per node; TREE_LEAF (-1) for leaves.
        return self._get_node_ndarray()['left_child'][:self.node_count]

    @property
    def children_right(self):
        # Right-child node id per node; TREE_LEAF (-1) for leaves.
        return self._get_node_ndarray()['right_child'][:self.node_count]
@property
def n_leaves(self):
return np.sum(np.logical_and(
self.children_left == -1,
self.children_right == -1))
    @property
    def feature(self):
        # Split feature index per node; undefined (-2) for leaves.
        return self._get_node_ndarray()['feature'][:self.node_count]

    @property
    def threshold(self):
        # Split threshold per node; undefined for leaves.
        return self._get_node_ndarray()['threshold'][:self.node_count]

    @property
    def impurity(self):
        # Impurity of each node's training partition.
        return self._get_node_ndarray()['impurity'][:self.node_count]

    @property
    def n_node_samples(self):
        # Number of training samples reaching each node.
        return self._get_node_ndarray()['n_node_samples'][:self.node_count]

    @property
    def weighted_n_node_samples(self):
        # Weighted number of training samples reaching each node.
        return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count]

    @property
    def missing_go_to_left(self):
        # Per-node flag: route samples with missing values to the left child.
        return self._get_node_ndarray()['missing_go_to_left'][:self.node_count]

    @property
    def value(self):
        # Node prediction values, shape (node_count, n_outputs, max_n_classes).
        return self._get_value_ndarray()[:self.node_count]
    # TODO: Convert n_classes to cython.integral memory view once
    # https://github.com/cython/cython/issues/5243 is fixed
    def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs):
        """Constructor."""
        cdef intp_t dummy = 0
        # Platform-native size_t-like dtype, used to validate n_classes.
        size_t_dtype = np.array(dummy).dtype

        n_classes = _check_n_classes(n_classes, size_t_dtype)

        # Input/Output layout
        self.n_features = n_features
        self.n_outputs = n_outputs
        self.n_classes = NULL
        safe_realloc(&self.n_classes, n_outputs)

        self.max_n_classes = np.max(n_classes)
        # Flat stride of one node's value slab: n_outputs * max_n_classes.
        self.value_stride = n_outputs * self.max_n_classes

        cdef intp_t k
        for k in range(n_outputs):
            self.n_classes[k] = n_classes[k]

        # Inner structures (buffers are allocated lazily by _resize).
        self.max_depth = 0
        self.node_count = 0
        self.capacity = 0
        self.value = NULL
        self.nodes = NULL
    def __dealloc__(self):
        """Destructor."""
        # Free all inner structures
        # (free(NULL) is a no-op, so this is safe even before allocation).
        free(self.n_classes)
        free(self.value)
        free(self.nodes)
    def __reduce__(self):
        """Reduce re-implementation, for pickling."""
        # Reconstruct via the (n_features, n_classes, n_outputs) constructor,
        # then restore the buffers through __setstate__.
        return (Tree, (self.n_features,
                       sizet_ptr_to_ndarray(self.n_classes, self.n_outputs),
                       self.n_outputs), self.__getstate__())
    def __getstate__(self):
        """Getstate re-implementation, for pickling."""
        d = {}
        # capacity is inferred during the __setstate__ using nodes
        d["max_depth"] = self.max_depth
        d["node_count"] = self.node_count
        d["nodes"] = self._get_node_ndarray()
        d["values"] = self._get_value_ndarray()
        return d
    def __setstate__(self, d):
        """Setstate re-implementation, for unpickling.

        Validates the pickled node/value arrays and copies them into the
        freshly (re)allocated C buffers.  Raises ValueError for states
        produced by incompatible Tree versions, MemoryError on allocation
        failure.
        """
        self.max_depth = d["max_depth"]
        self.node_count = d["node_count"]

        if 'nodes' not in d:
            raise ValueError('You have loaded Tree version which '
                             'cannot be imported')

        node_ndarray = d['nodes']
        value_ndarray = d['values']

        value_shape = (node_ndarray.shape[0], self.n_outputs,
                       self.max_n_classes)

        # Reject arrays with an unexpected dtype/shape (e.g. from another
        # platform or scikit-learn version) before touching C memory.
        node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE)
        value_ndarray = _check_value_ndarray(
            value_ndarray,
            expected_dtype=np.dtype(np.float64),
            expected_shape=value_shape
        )

        self.capacity = node_ndarray.shape[0]
        if self._resize_c(self.capacity) != 0:
            raise MemoryError("resizing tree to %d" % self.capacity)

        # Raw byte copy into the internal buffers.
        memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray),
               self.capacity * sizeof(Node))
        memcpy(self.value, cnp.PyArray_DATA(value_ndarray),
               self.capacity * self.value_stride * sizeof(float64_t))
    cdef int _resize(self, intp_t capacity) except -1 nogil:
        """Resize all inner arrays to `capacity`, if `capacity` == -1, then
           double the size of the inner arrays.

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        if self._resize_c(capacity) != 0:
            # Acquire gil only if we need to raise
            with gil:
                raise MemoryError()
    cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil:
        """Guts of _resize

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        if capacity == self.capacity and self.nodes != NULL:
            return 0

        # INTPTR_MAX is the sentinel default meaning "grow automatically".
        if capacity == INTPTR_MAX:
            if self.capacity == 0:
                capacity = 3  # default initial value
            else:
                capacity = 2 * self.capacity

        safe_realloc(&self.nodes, capacity)
        safe_realloc(&self.value, capacity * self.value_stride)

        if capacity > self.capacity:
            # value memory is initialised to 0 to enable classifier argmax
            memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                   (capacity - self.capacity) * self.value_stride *
                   sizeof(float64_t))

            # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
            memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))

        # if capacity smaller than node_count, adjust the counter
        if capacity < self.node_count:
            self.node_count = capacity

        self.capacity = capacity
        return 0
    cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf,
                          intp_t feature, float64_t threshold, float64_t impurity,
                          intp_t n_node_samples,
                          float64_t weighted_n_node_samples,
                          uint8_t missing_go_to_left) except -1 nogil:
        """Add a node to the tree.

        The new node registers itself as the child of its parent.

        Returns (size_t)(-1) on error.
        """
        cdef intp_t node_id = self.node_count

        if node_id >= self.capacity:
            # Grow the buffers (doubling strategy); INTPTR_MAX signals failure.
            if self._resize_c() != 0:
                return INTPTR_MAX

        cdef Node* node = &self.nodes[node_id]
        node.impurity = impurity
        node.n_node_samples = n_node_samples
        node.weighted_n_node_samples = weighted_n_node_samples

        # Link this node into its parent (the root has no parent).
        if parent != _TREE_UNDEFINED:
            if is_left:
                self.nodes[parent].left_child = node_id
            else:
                self.nodes[parent].right_child = node_id

        if is_leaf:
            # Leaves carry sentinel children/feature/threshold values.
            node.left_child = _TREE_LEAF
            node.right_child = _TREE_LEAF
            node.feature = _TREE_UNDEFINED
            node.threshold = _TREE_UNDEFINED

        else:
            # left_child and right_child will be set later
            node.feature = feature
            node.threshold = threshold
            node.missing_go_to_left = missing_go_to_left

        self.node_count += 1

        return node_id
cpdef cnp.ndarray predict(self, object X):
"""Predict target for X."""
out = self._get_value_ndarray().take(self.apply(X), axis=0,
mode='clip')
if self.n_outputs == 1:
out = out.reshape(X.shape[0], self.max_n_classes)
return out
cpdef cnp.ndarray apply(self, object X):
"""Finds the terminal region (=leaf node) for each sample in X."""
if issparse(X):
return self._apply_sparse_csr(X)
else:
return self._apply_dense(X)
    cdef inline cnp.ndarray _apply_dense(self, object X):
        """Finds the terminal region (=leaf node) for each sample in X.

        X must be a float32 ndarray; returns an intp array of leaf node
        ids, one per sample.
        """
        # Check input
        if not isinstance(X, np.ndarray):
            raise ValueError("X should be in np.ndarray format, got %s"
                             % type(X))

        if X.dtype != DTYPE:
            raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

        # Extract input
        cdef const float32_t[:, :] X_ndarray = X
        cdef intp_t n_samples = X.shape[0]
        cdef float32_t X_i_node_feature

        # Initialize output
        cdef intp_t[:] out = np.zeros(n_samples, dtype=np.intp)

        # Initialize auxiliary data-structure
        cdef Node* node = NULL
        cdef intp_t i = 0

        with nogil:
            for i in range(n_samples):
                # Start each traversal at the root (nodes[0]).
                node = self.nodes
                # While node not a leaf
                while node.left_child != _TREE_LEAF:
                    X_i_node_feature = X_ndarray[i, node.feature]
                    # ... and node.right_child != _TREE_LEAF:
                    if isnan(X_i_node_feature):
                        # Missing values follow the direction learned at fit time.
                        if node.missing_go_to_left:
                            node = &self.nodes[node.left_child]
                        else:
                            node = &self.nodes[node.right_child]
                    elif X_i_node_feature <= node.threshold:
                        node = &self.nodes[node.left_child]
                    else:
                        node = &self.nodes[node.right_child]

                out[i] = <intp_t>(node - self.nodes)  # node offset

        return np.asarray(out)