-
-
Notifications
You must be signed in to change notification settings - Fork 27k
Expand file tree
/
Copy path_tree.pyx
More file actions
1998 lines (1620 loc) · 72.6 KB
/
_tree.pyx
File metadata and controls
1998 lines (1620 loc) · 72.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause
from cpython cimport Py_INCREF, PyObject, PyTypeObject
from libc.stdlib cimport free
from libc.string cimport memcpy
from libc.string cimport memset
from libc.stdint cimport INTPTR_MAX
from libc.math cimport isnan
from libcpp.vector cimport vector
from libcpp.algorithm cimport pop_heap
from libcpp.algorithm cimport push_heap
from libcpp.stack cimport stack
from libcpp cimport bool
import struct
import numpy as np
cimport numpy as cnp
cnp.import_array()
from scipy.sparse import issparse
from scipy.sparse import csr_array
from sklearn.utils import _align_api_if_sparse
from sklearn.tree._utils cimport safe_realloc
from sklearn.tree._utils cimport sizet_ptr_to_ndarray
cdef extern from "numpy/arrayobject.h":
object PyArray_NewFromDescr(PyTypeObject* subtype, cnp.dtype descr,
int nd, cnp.npy_intp* dims,
cnp.npy_intp* strides,
void* data, int flags, object obj)
int PyArray_SetBaseObject(cnp.ndarray arr, PyObject* obj)
# =============================================================================
# Types and constants
# =============================================================================
# Input dtype expected by the splitters / apply methods (single precision);
# DOUBLE is the dtype of y, sample_weight and the stored node values.
from numpy import float32 as DTYPE
from numpy import float64 as DOUBLE

cdef float64_t INFINITY = np.inf
# float64 machine epsilon, used as a tolerance for impurity comparisons.
cdef float64_t EPSILON = np.finfo('double').eps

# Some handy constants (BestFirstTreeBuilder)
cdef bint IS_FIRST = 1
cdef bint IS_NOT_FIRST = 0
cdef bint IS_LEFT = 1
cdef bint IS_NOT_LEFT = 0

# Sentinel ids: TREE_LEAF marks the children fields of a leaf node,
# TREE_UNDEFINED marks fields that do not apply (e.g. the root's parent).
TREE_LEAF = -1
TREE_UNDEFINED = -2
# C-typed copies usable inside nogil sections.
cdef intp_t _TREE_LEAF = TREE_LEAF
cdef intp_t _TREE_UNDEFINED = TREE_UNDEFINED

# Build the corresponding numpy dtype for Node.
# This works by casting `dummy` to an array of Node of length 1, which numpy
# can construct a `dtype`-object for. See https://stackoverflow.com/q/62448946
# for a more detailed explanation.
cdef Node dummy
NODE_DTYPE = np.asarray(<Node[:1]>(&dummy)).dtype
cdef inline void _init_parent_record(ParentInfo* record) noexcept nogil:
    """Reset a ParentInfo struct to its pre-build defaults.

    Impurity starts at +inf (unknown) and the monotonicity bounds start
    fully open, i.e. (-inf, +inf); no constant features seen yet.
    """
    record.impurity = INFINITY
    record.lower_bound = -INFINITY
    record.upper_bound = INFINITY
    record.n_constant_features = 0
# =============================================================================
# TreeBuilder
# =============================================================================
cdef class TreeBuilder:
    """Interface for different tree building strategies."""

    cpdef build(
        self,
        Tree tree,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight=None,
        const uint8_t[::1] missing_values_in_feature_mask=None,
    ):
        """Build a decision tree from the training set (X, y).

        Abstract method: concrete builders (depth-first, best-first)
        override this and populate ``tree`` in place.
        """
        pass

    cdef inline _check_input(
        self,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight,
    ):
        """Check input dtype, layout and format"""
        if issparse(X):
            # Splitters require CSC layout with sorted indices for sparse X.
            X = X.tocsc()
            X.sort_indices()

            if X.data.dtype != DTYPE:
                X.data = np.ascontiguousarray(X.data, dtype=DTYPE)

            if X.indices.dtype != np.int32 or X.indptr.dtype != np.int32:
                raise ValueError("No support for np.int64 index based "
                                 "sparse matrices")

        elif X.dtype != DTYPE:
            # since we have to copy we will make it fortran for efficiency
            X = np.asfortranarray(X, dtype=DTYPE)

        # NOTE(review): only contiguity is checked here; sample_weight is
        # presumably already float64 when it reaches this point — confirm
        # against the callers' validation.
        if sample_weight is not None and not sample_weight.base.flags.contiguous:
            sample_weight = np.asarray(sample_weight, dtype=DOUBLE, order="C")

        return X, y, sample_weight
# Depth first builder ---------------------------------------------------------
# A record on the stack for depth-first tree growing
# A record on the stack for depth-first tree growing
cdef struct StackRecord:
    intp_t start                 # first sample index of this node's partition
    intp_t end                   # one past the last sample index
    intp_t depth                 # depth of the node in the tree
    intp_t parent                # parent node id (_TREE_UNDEFINED for the root)
    bint is_left                 # whether this node is its parent's left child
    float64_t impurity           # node impurity, as computed by the parent's split
    intp_t n_constant_features   # constant-feature count carried over from the parent
    float64_t lower_bound        # monotonicity lower bound for the node's value
    float64_t upper_bound        # monotonicity upper bound for the node's value
cdef class DepthFirstTreeBuilder(TreeBuilder):
    """Build a decision tree in depth-first fashion."""

    def __cinit__(self, Splitter splitter, intp_t min_samples_split,
                  intp_t min_samples_leaf, float64_t min_weight_leaf,
                  intp_t max_depth, float64_t min_impurity_decrease):
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.max_depth = max_depth
        self.min_impurity_decrease = min_impurity_decrease

    cpdef build(
        self,
        Tree tree,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight=None,
        const uint8_t[::1] missing_values_in_feature_mask=None,
    ):
        """Build a decision tree from the training set (X, y).

        Grows the tree iteratively with an explicit LIFO stack instead of
        recursion; the whole growing loop runs without the GIL.
        """

        # check input
        X, y, sample_weight = self._check_input(X, y, sample_weight)

        # Initial capacity
        cdef intp_t init_capacity

        if tree.max_depth <= 10:
            # A complete binary tree of depth d has 2**(d+1) - 1 nodes.
            init_capacity = <intp_t> (2 ** (tree.max_depth + 1)) - 1
        else:
            # Deep trees: start with a fixed capacity and grow on demand.
            init_capacity = 2047

        tree._resize(init_capacity)

        # Parameters (hoisted to C locals for nogil access)
        cdef Splitter splitter = self.splitter
        cdef intp_t max_depth = self.max_depth
        cdef intp_t min_samples_leaf = self.min_samples_leaf
        cdef float64_t min_weight_leaf = self.min_weight_leaf
        cdef intp_t min_samples_split = self.min_samples_split
        cdef float64_t min_impurity_decrease = self.min_impurity_decrease

        # Recursive partition (without actual recursion)
        splitter.init(X, y, sample_weight, missing_values_in_feature_mask)

        cdef intp_t start
        cdef intp_t end
        cdef intp_t depth
        cdef intp_t parent
        cdef bint is_left
        cdef intp_t n_node_samples = splitter.n_samples
        cdef float64_t weighted_n_node_samples
        cdef SplitRecord split
        cdef intp_t node_id

        cdef float64_t middle_value
        cdef float64_t left_child_min
        cdef float64_t left_child_max
        cdef float64_t right_child_min
        cdef float64_t right_child_max
        cdef bint is_leaf
        cdef bint first = 1
        cdef intp_t max_depth_seen = -1
        cdef int rc = 0

        cdef stack[StackRecord] builder_stack
        cdef StackRecord stack_record

        cdef ParentInfo parent_record
        _init_parent_record(&parent_record)

        with nogil:
            # push root node onto stack
            builder_stack.push({
                "start": 0,
                "end": n_node_samples,
                "depth": 0,
                "parent": _TREE_UNDEFINED,
                "is_left": 0,
                "impurity": INFINITY,
                "n_constant_features": 0,
                "lower_bound": -INFINITY,
                "upper_bound": INFINITY,
            })

            while not builder_stack.empty():
                stack_record = builder_stack.top()
                builder_stack.pop()

                start = stack_record.start
                end = stack_record.end
                depth = stack_record.depth
                parent = stack_record.parent
                is_left = stack_record.is_left
                parent_record.impurity = stack_record.impurity
                parent_record.n_constant_features = stack_record.n_constant_features
                parent_record.lower_bound = stack_record.lower_bound
                parent_record.upper_bound = stack_record.upper_bound

                n_node_samples = end - start
                splitter.node_reset(start, end, &weighted_n_node_samples)

                # Pre-split stopping criteria: depth limit, sample-count and
                # sample-weight thresholds.
                is_leaf = (depth >= max_depth or
                           n_node_samples < min_samples_split or
                           n_node_samples < 2 * min_samples_leaf or
                           weighted_n_node_samples < 2 * min_weight_leaf)

                if first:
                    # The root impurity is unknown at push time; compute it once.
                    parent_record.impurity = splitter.node_impurity()
                    first = 0

                # impurity == 0 with tolerance due to rounding errors
                is_leaf = is_leaf or parent_record.impurity <= EPSILON

                if not is_leaf:
                    splitter.node_split(
                        &parent_record,
                        &split,
                    )
                    # If EPSILON=0 in the below comparison, float precision
                    # issues stop splitting, producing trees that are
                    # dissimilar to v0.18
                    # split.pos >= end means no valid split was found.
                    is_leaf = (is_leaf or split.pos >= end or
                               (split.improvement + EPSILON <
                                min_impurity_decrease))

                node_id = tree._add_node(parent, is_left, is_leaf, split.feature,
                                         split.threshold, parent_record.impurity,
                                         n_node_samples, weighted_n_node_samples,
                                         split.missing_go_to_left)

                if node_id == INTPTR_MAX:
                    # Out of memory while growing the node array.
                    rc = -1
                    break

                # Store value for all nodes, to facilitate tree/model
                # inspection and interpretation
                splitter.node_value(tree.value + node_id * tree.value_stride)
                if splitter.with_monotonic_cst:
                    splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound)

                if not is_leaf:
                    if (
                        not splitter.with_monotonic_cst or
                        splitter.monotonic_cst[split.feature] == 0
                    ):
                        # Split on a feature with no monotonicity constraint

                        # Current bounds must always be propagated to both children.
                        # If a monotonic constraint is active, bounds are used in
                        # node value clipping.
                        left_child_min = right_child_min = parent_record.lower_bound
                        left_child_max = right_child_max = parent_record.upper_bound

                    elif splitter.monotonic_cst[split.feature] == 1:
                        # Split on a feature with monotonic increase constraint
                        left_child_min = parent_record.lower_bound
                        right_child_max = parent_record.upper_bound

                        # Lower bound for right child and upper bound for left child
                        # are set to the same value.
                        middle_value = splitter.criterion.middle_value()
                        right_child_min = middle_value
                        left_child_max = middle_value

                    else:  # i.e. splitter.monotonic_cst[split.feature] == -1
                        # Split on a feature with monotonic decrease constraint
                        right_child_min = parent_record.lower_bound
                        left_child_max = parent_record.upper_bound

                        # Lower bound for left child and upper bound for right child
                        # are set to the same value.
                        middle_value = splitter.criterion.middle_value()
                        left_child_min = middle_value
                        right_child_max = middle_value

                    # Push right child on stack first so the left child is
                    # popped (and grown) next — a depth-first, left-to-right
                    # traversal.
                    builder_stack.push({
                        "start": split.pos,
                        "end": end,
                        "depth": depth + 1,
                        "parent": node_id,
                        "is_left": 0,
                        "impurity": split.impurity_right,
                        "n_constant_features": parent_record.n_constant_features,
                        "lower_bound": right_child_min,
                        "upper_bound": right_child_max,
                    })

                    # Push left child on stack
                    builder_stack.push({
                        "start": start,
                        "end": split.pos,
                        "depth": depth + 1,
                        "parent": node_id,
                        "is_left": 1,
                        "impurity": split.impurity_left,
                        "n_constant_features": parent_record.n_constant_features,
                        "lower_bound": left_child_min,
                        "upper_bound": left_child_max,
                    })

                if depth > max_depth_seen:
                    max_depth_seen = depth

            if rc >= 0:
                # Shrink the over-allocated arrays down to the node count.
                rc = tree._resize_c(tree.node_count)

            if rc >= 0:
                tree.max_depth = max_depth_seen
        if rc == -1:
            raise MemoryError()
# Best first builder ----------------------------------------------------------
cdef struct FrontierRecord:
    # Record of information of a Node, the frontier for a split. Those records are
    # maintained in a heap to access the Node with the best improvement in impurity,
    # allowing growing trees greedily on this improvement.
    intp_t node_id               # id of the node in the tree's node array
    intp_t start                 # first sample index of the node's partition
    intp_t end                   # one past the last sample index
    intp_t pos                   # split position (== end when the node is a leaf)
    intp_t depth                 # depth of the node in the tree
    bint is_leaf                 # whether the node cannot be expanded further
    float64_t impurity           # impurity of the node itself
    float64_t impurity_left      # impurity of the prospective left child
    float64_t impurity_right     # impurity of the prospective right child
    float64_t improvement        # impurity improvement of the best split (heap key)
    float64_t lower_bound        # monotonicity lower bound for the node's value
    float64_t upper_bound        # monotonicity upper bound for the node's value
    float64_t middle_value       # criterion middle value, used as the children's bound
cdef inline bool _compare_records(
    const FrontierRecord& left,
    const FrontierRecord& right,
):
    # Heap comparator: ordering records by `improvement` makes the C++
    # push_heap/pop_heap pair behave as a max-heap on impurity improvement,
    # so the best candidate split is always popped first.
    return left.improvement < right.improvement
cdef inline void _add_to_frontier(
    FrontierRecord rec,
    vector[FrontierRecord]& frontier,
) noexcept nogil:
    """Adds record `rec` to the priority queue `frontier`."""
    # The vector is kept in heap order via push_heap/pop_heap.
    frontier.push_back(rec)
    push_heap(frontier.begin(), frontier.end(), &_compare_records)
cdef class BestFirstTreeBuilder(TreeBuilder):
    """Build a decision tree in best-first fashion.

    The best node to expand is given by the node at the frontier that has the
    highest impurity improvement.
    """
    # Maximum number of leaves the finished tree may have; bounds the number
    # of split nodes via max_split_nodes = max_leaf_nodes - 1.
    cdef intp_t max_leaf_nodes

    def __cinit__(self, Splitter splitter, intp_t min_samples_split,
                  intp_t min_samples_leaf, min_weight_leaf,
                  intp_t max_depth, intp_t max_leaf_nodes,
                  float64_t min_impurity_decrease):
        self.splitter = splitter
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.min_weight_leaf = min_weight_leaf
        self.max_depth = max_depth
        self.max_leaf_nodes = max_leaf_nodes
        self.min_impurity_decrease = min_impurity_decrease

    cpdef build(
        self,
        Tree tree,
        object X,
        const float64_t[:, ::1] y,
        const float64_t[:] sample_weight=None,
        const uint8_t[::1] missing_values_in_feature_mask=None,
    ):
        """Build a decision tree from the training set (X, y).

        Repeatedly pops the frontier node with the best impurity
        improvement and splits it, until the frontier is empty or the
        split-node budget is exhausted.
        """

        # check input
        X, y, sample_weight = self._check_input(X, y, sample_weight)

        # Parameters
        cdef Splitter splitter = self.splitter
        cdef intp_t max_leaf_nodes = self.max_leaf_nodes

        # Recursive partition (without actual recursion)
        splitter.init(X, y, sample_weight, missing_values_in_feature_mask)

        cdef vector[FrontierRecord] frontier
        cdef FrontierRecord record
        cdef FrontierRecord split_node_left
        cdef FrontierRecord split_node_right

        cdef float64_t left_child_min
        cdef float64_t left_child_max
        cdef float64_t right_child_min
        cdef float64_t right_child_max

        cdef intp_t n_node_samples = splitter.n_samples
        cdef intp_t max_split_nodes = max_leaf_nodes - 1
        cdef bint is_leaf
        cdef intp_t max_depth_seen = -1
        cdef int rc = 0
        cdef Node* node

        cdef ParentInfo parent_record
        _init_parent_record(&parent_record)

        # Initial capacity: a tree with L leaves has at most L - 1 splits.
        cdef intp_t init_capacity = max_split_nodes + max_leaf_nodes
        tree._resize(init_capacity)

        with nogil:
            # add root to frontier
            rc = self._add_split_node(
                splitter=splitter,
                tree=tree,
                start=0,
                end=n_node_samples,
                is_first=IS_FIRST,
                is_left=IS_LEFT,
                parent=NULL,
                depth=0,
                parent_record=&parent_record,
                res=&split_node_left,
            )
            if rc >= 0:
                _add_to_frontier(split_node_left, frontier)

            while not frontier.empty():
                # Pop the record with the highest impurity improvement.
                pop_heap(frontier.begin(), frontier.end(), &_compare_records)
                record = frontier.back()
                frontier.pop_back()

                node = &tree.nodes[record.node_id]
                is_leaf = (record.is_leaf or max_split_nodes <= 0)

                if is_leaf:
                    # Node is not expandable; set node as leaf
                    node.left_child = _TREE_LEAF
                    node.right_child = _TREE_LEAF
                    node.feature = _TREE_UNDEFINED
                    node.threshold = _TREE_UNDEFINED

                else:
                    # Node is expandable

                    if (
                        not splitter.with_monotonic_cst or
                        splitter.monotonic_cst[node.feature] == 0
                    ):
                        # Split on a feature with no monotonicity constraint

                        # Current bounds must always be propagated to both children.
                        # If a monotonic constraint is active, bounds are used in
                        # node value clipping.
                        left_child_min = right_child_min = record.lower_bound
                        left_child_max = right_child_max = record.upper_bound

                    elif splitter.monotonic_cst[node.feature] == 1:
                        # Split on a feature with monotonic increase constraint
                        left_child_min = record.lower_bound
                        right_child_max = record.upper_bound

                        # Lower bound for right child and upper bound for left child
                        # are set to the same value.
                        right_child_min = record.middle_value
                        left_child_max = record.middle_value

                    else:  # i.e. splitter.monotonic_cst[split.feature] == -1
                        # Split on a feature with monotonic decrease constraint
                        right_child_min = record.lower_bound
                        left_child_max = record.upper_bound

                        # Lower bound for left child and upper bound for right child
                        # are set to the same value.
                        left_child_min = record.middle_value
                        right_child_max = record.middle_value

                    # Decrement number of split nodes available
                    max_split_nodes -= 1

                    # Compute left split node
                    parent_record.lower_bound = left_child_min
                    parent_record.upper_bound = left_child_max
                    parent_record.impurity = record.impurity_left
                    rc = self._add_split_node(
                        splitter=splitter,
                        tree=tree,
                        start=record.start,
                        end=record.pos,
                        is_first=IS_NOT_FIRST,
                        is_left=IS_LEFT,
                        parent=node,
                        depth=record.depth + 1,
                        parent_record=&parent_record,
                        res=&split_node_left,
                    )
                    if rc == -1:
                        break

                    # tree.nodes may have changed
                    # (adding a node can trigger a realloc that moves the
                    # buffer, invalidating the old pointer).
                    node = &tree.nodes[record.node_id]

                    # Compute right split node
                    parent_record.lower_bound = right_child_min
                    parent_record.upper_bound = right_child_max
                    parent_record.impurity = record.impurity_right
                    rc = self._add_split_node(
                        splitter=splitter,
                        tree=tree,
                        start=record.pos,
                        end=record.end,
                        is_first=IS_NOT_FIRST,
                        is_left=IS_NOT_LEFT,
                        parent=node,
                        depth=record.depth + 1,
                        parent_record=&parent_record,
                        res=&split_node_right,
                    )
                    if rc == -1:
                        break

                    # Add nodes to queue
                    _add_to_frontier(split_node_left, frontier)
                    _add_to_frontier(split_node_right, frontier)

                if record.depth > max_depth_seen:
                    max_depth_seen = record.depth

            if rc >= 0:
                # Shrink the over-allocated arrays down to the node count.
                rc = tree._resize_c(tree.node_count)

            if rc >= 0:
                tree.max_depth = max_depth_seen

        if rc == -1:
            raise MemoryError()

    cdef inline int _add_split_node(
        self,
        Splitter splitter,
        Tree tree,
        intp_t start,
        intp_t end,
        bint is_first,
        bint is_left,
        Node* parent,
        intp_t depth,
        ParentInfo* parent_record,
        FrontierRecord* res
    ) except -1 nogil:
        """Adds node w/ partition ``[start, end)`` to the frontier.

        Evaluates the best split for the node, records it in the tree and
        fills ``res`` with the frontier record.  Returns 0 on success,
        -1 on allocation failure.
        """
        cdef SplitRecord split
        cdef intp_t node_id
        cdef intp_t n_node_samples
        cdef float64_t min_impurity_decrease = self.min_impurity_decrease
        cdef float64_t weighted_n_node_samples
        cdef bint is_leaf

        splitter.node_reset(start, end, &weighted_n_node_samples)

        # reset n_constant_features for this specific split before beginning split search
        parent_record.n_constant_features = 0

        if is_first:
            # The root's impurity has not been computed yet.
            parent_record.impurity = splitter.node_impurity()

        n_node_samples = end - start
        # Pre-split stopping criteria: depth, sample counts, weight and purity.
        is_leaf = (depth >= self.max_depth or
                   n_node_samples < self.min_samples_split or
                   n_node_samples < 2 * self.min_samples_leaf or
                   weighted_n_node_samples < 2 * self.min_weight_leaf or
                   parent_record.impurity <= EPSILON  # impurity == 0 with tolerance
                   )

        if not is_leaf:
            splitter.node_split(
                parent_record,
                &split,
            )
            # If EPSILON=0 in the below comparison, float precision issues stop
            # splitting early, producing trees that are dissimilar to v0.18
            # split.pos >= end means no valid split was found.
            is_leaf = (is_leaf or split.pos >= end or
                       split.improvement + EPSILON < min_impurity_decrease)

        # The parent id is recovered from the pointer offset into tree.nodes.
        node_id = tree._add_node(parent - tree.nodes
                                 if parent != NULL
                                 else _TREE_UNDEFINED,
                                 is_left, is_leaf,
                                 split.feature, split.threshold, parent_record.impurity,
                                 n_node_samples, weighted_n_node_samples,
                                 split.missing_go_to_left)
        if node_id == INTPTR_MAX:
            return -1

        # compute values also for split nodes (might become leaves later).
        splitter.node_value(tree.value + node_id * tree.value_stride)
        if splitter.with_monotonic_cst:
            splitter.clip_node_value(tree.value + node_id * tree.value_stride, parent_record.lower_bound, parent_record.upper_bound)

        res.node_id = node_id
        res.start = start
        res.end = end
        res.depth = depth
        res.impurity = parent_record.impurity
        res.lower_bound = parent_record.lower_bound
        res.upper_bound = parent_record.upper_bound
        res.middle_value = splitter.criterion.middle_value()

        if not is_leaf:
            # is split node
            res.pos = split.pos
            res.is_leaf = 0
            res.improvement = split.improvement
            res.impurity_left = split.impurity_left
            res.impurity_right = split.impurity_right

        else:
            # is leaf => 0 improvement
            res.pos = end
            res.is_leaf = 1
            res.improvement = 0.0
            res.impurity_left = parent_record.impurity
            res.impurity_right = parent_record.impurity

        return 0
# =============================================================================
# Tree
# =============================================================================
cdef class Tree:
"""Array-based representation of a binary decision tree.
The binary tree is represented as a number of parallel arrays. The i-th
element of each array holds information about the node `i`. Node 0 is the
tree's root. You can find a detailed description of all arrays in
`_tree.pxd`. NOTE: Some of the arrays only apply to either leaves or split
nodes, resp. In this case the values of nodes of the other type are
arbitrary!
Attributes
----------
node_count : intp_t
The number of nodes (internal nodes + leaves) in the tree.
capacity : intp_t
The current capacity (i.e., size) of the arrays, which is at least as
great as `node_count`.
max_depth : intp_t
The depth of the tree, i.e. the maximum depth of its leaves.
children_left : array of intp_t, shape [node_count]
children_left[i] holds the node id of the left child of node i.
For leaves, children_left[i] == TREE_LEAF. Otherwise,
children_left[i] > i. This child handles the case where
X[:, feature[i]] <= threshold[i].
children_right : array of intp_t, shape [node_count]
children_right[i] holds the node id of the right child of node i.
For leaves, children_right[i] == TREE_LEAF. Otherwise,
children_right[i] > i. This child handles the case where
X[:, feature[i]] > threshold[i].
n_leaves : intp_t
Number of leaves in the tree.
feature : array of intp_t, shape [node_count]
feature[i] holds the feature to split on, for the internal node i.
threshold : array of float64_t, shape [node_count]
threshold[i] holds the threshold for the internal node i.
value : array of float64_t, shape [node_count, n_outputs, max_n_classes]
Contains the constant prediction value of each node.
impurity : array of float64_t, shape [node_count]
impurity[i] holds the impurity (i.e., the value of the splitting
criterion) at node i.
n_node_samples : array of intp_t, shape [node_count]
n_node_samples[i] holds the number of training samples reaching node i.
weighted_n_node_samples : array of float64_t, shape [node_count]
weighted_n_node_samples[i] holds the weighted number of training samples
reaching node i.
missing_go_to_left : array of bool, shape [node_count]
missing_go_to_left[i] holds a bool indicating whether or not there were
missing values at node i.
"""
    # Wrap for outside world.
    # WARNING: these reference the current `nodes` and `value` buffers, which
    # must not be freed by a subsequent memory allocation.
    # (i.e. through `_resize` or `__setstate__`)
    @property
    def n_classes(self):
        # Per-output class counts, copied out of the C array.
        return sizet_ptr_to_ndarray(self.n_classes, self.n_outputs)

    @property
    def children_left(self):
        # Left-child node id per node; TREE_LEAF (-1) for leaves.
        return self._get_node_ndarray()['left_child'][:self.node_count]

    @property
    def children_right(self):
        # Right-child node id per node; TREE_LEAF (-1) for leaves.
        return self._get_node_ndarray()['right_child'][:self.node_count]
@property
def n_leaves(self):
return np.sum(np.logical_and(
self.children_left == -1,
self.children_right == -1))
    @property
    def feature(self):
        # Split feature index per node; undefined (-2) for leaves.
        return self._get_node_ndarray()['feature'][:self.node_count]

    @property
    def threshold(self):
        # Split threshold per node; undefined for leaves.
        return self._get_node_ndarray()['threshold'][:self.node_count]

    @property
    def impurity(self):
        # Impurity of each node's training partition.
        return self._get_node_ndarray()['impurity'][:self.node_count]

    @property
    def n_node_samples(self):
        # Number of training samples reaching each node.
        return self._get_node_ndarray()['n_node_samples'][:self.node_count]

    @property
    def weighted_n_node_samples(self):
        # Weighted number of training samples reaching each node.
        return self._get_node_ndarray()['weighted_n_node_samples'][:self.node_count]

    @property
    def missing_go_to_left(self):
        # Per-node flag: route samples with missing values to the left child.
        return self._get_node_ndarray()['missing_go_to_left'][:self.node_count]

    @property
    def value(self):
        # Node prediction values, shape (node_count, n_outputs, max_n_classes).
        return self._get_value_ndarray()[:self.node_count]
    # TODO: Convert n_classes to cython.integral memory view once
    # https://github.com/cython/cython/issues/5243 is fixed
    def __cinit__(self, intp_t n_features, cnp.ndarray n_classes, intp_t n_outputs):
        """Constructor."""
        cdef intp_t dummy = 0
        # Platform-native size_t-like dtype, used to validate n_classes.
        size_t_dtype = np.array(dummy).dtype

        n_classes = _check_n_classes(n_classes, size_t_dtype)

        # Input/Output layout
        self.n_features = n_features
        self.n_outputs = n_outputs
        self.n_classes = NULL
        safe_realloc(&self.n_classes, n_outputs)

        self.max_n_classes = np.max(n_classes)
        # Flat stride of one node's value slab: n_outputs * max_n_classes.
        self.value_stride = n_outputs * self.max_n_classes

        cdef intp_t k
        for k in range(n_outputs):
            self.n_classes[k] = n_classes[k]

        # Inner structures (buffers are allocated lazily by _resize).
        self.max_depth = 0
        self.node_count = 0
        self.capacity = 0
        self.value = NULL
        self.nodes = NULL
    def __dealloc__(self):
        """Destructor."""
        # Free all inner structures
        # (free(NULL) is a no-op, so this is safe even before allocation).
        free(self.n_classes)
        free(self.value)
        free(self.nodes)
    def __reduce__(self):
        """Reduce re-implementation, for pickling."""
        # Reconstruct via the (n_features, n_classes, n_outputs) constructor,
        # then restore the buffers through __setstate__.
        return (Tree, (self.n_features,
                       sizet_ptr_to_ndarray(self.n_classes, self.n_outputs),
                       self.n_outputs), self.__getstate__())
    def __getstate__(self):
        """Getstate re-implementation, for pickling."""
        d = {}
        # capacity is inferred during the __setstate__ using nodes
        d["max_depth"] = self.max_depth
        d["node_count"] = self.node_count
        d["nodes"] = self._get_node_ndarray()
        d["values"] = self._get_value_ndarray()
        return d
    def __setstate__(self, d):
        """Setstate re-implementation, for unpickling.

        Validates the pickled node/value arrays and copies them into the
        freshly (re)allocated C buffers.  Raises ValueError for states
        produced by incompatible Tree versions, MemoryError on allocation
        failure.
        """
        self.max_depth = d["max_depth"]
        self.node_count = d["node_count"]

        if 'nodes' not in d:
            raise ValueError('You have loaded Tree version which '
                             'cannot be imported')

        node_ndarray = d['nodes']
        value_ndarray = d['values']

        value_shape = (node_ndarray.shape[0], self.n_outputs,
                       self.max_n_classes)

        # Reject arrays with an unexpected dtype/shape (e.g. from another
        # platform or scikit-learn version) before touching C memory.
        node_ndarray = _check_node_ndarray(node_ndarray, expected_dtype=NODE_DTYPE)
        value_ndarray = _check_value_ndarray(
            value_ndarray,
            expected_dtype=np.dtype(np.float64),
            expected_shape=value_shape
        )

        self.capacity = node_ndarray.shape[0]
        if self._resize_c(self.capacity) != 0:
            raise MemoryError("resizing tree to %d" % self.capacity)

        # Raw byte copy into the internal buffers.
        memcpy(self.nodes, cnp.PyArray_DATA(node_ndarray),
               self.capacity * sizeof(Node))
        memcpy(self.value, cnp.PyArray_DATA(value_ndarray),
               self.capacity * self.value_stride * sizeof(float64_t))
    cdef int _resize(self, intp_t capacity) except -1 nogil:
        """Resize all inner arrays to `capacity`, if `capacity` == -1, then
           double the size of the inner arrays.

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        if self._resize_c(capacity) != 0:
            # Acquire gil only if we need to raise
            with gil:
                raise MemoryError()
    cdef int _resize_c(self, intp_t capacity=INTPTR_MAX) except -1 nogil:
        """Guts of _resize

        Returns -1 in case of failure to allocate memory (and raise MemoryError)
        or 0 otherwise.
        """
        if capacity == self.capacity and self.nodes != NULL:
            return 0

        # INTPTR_MAX is the sentinel default meaning "grow automatically".
        if capacity == INTPTR_MAX:
            if self.capacity == 0:
                capacity = 3  # default initial value
            else:
                capacity = 2 * self.capacity

        safe_realloc(&self.nodes, capacity)
        safe_realloc(&self.value, capacity * self.value_stride)

        if capacity > self.capacity:
            # value memory is initialised to 0 to enable classifier argmax
            memset(<void*>(self.value + self.capacity * self.value_stride), 0,
                   (capacity - self.capacity) * self.value_stride *
                   sizeof(float64_t))

            # node memory is initialised to 0 to ensure deterministic pickle (padding in Node struct)
            memset(<void*>(self.nodes + self.capacity), 0, (capacity - self.capacity) * sizeof(Node))

        # if capacity smaller than node_count, adjust the counter
        if capacity < self.node_count:
            self.node_count = capacity

        self.capacity = capacity
        return 0
    cdef intp_t _add_node(self, intp_t parent, bint is_left, bint is_leaf,
                          intp_t feature, float64_t threshold, float64_t impurity,
                          intp_t n_node_samples,
                          float64_t weighted_n_node_samples,
                          uint8_t missing_go_to_left) except -1 nogil:
        """Add a node to the tree.

        The new node registers itself as the child of its parent.

        Returns (size_t)(-1) on error.
        """
        cdef intp_t node_id = self.node_count

        if node_id >= self.capacity:
            # Grow the buffers (doubling strategy); INTPTR_MAX signals failure.
            if self._resize_c() != 0:
                return INTPTR_MAX

        cdef Node* node = &self.nodes[node_id]
        node.impurity = impurity
        node.n_node_samples = n_node_samples
        node.weighted_n_node_samples = weighted_n_node_samples

        # Link this node into its parent (the root has no parent).
        if parent != _TREE_UNDEFINED:
            if is_left:
                self.nodes[parent].left_child = node_id
            else:
                self.nodes[parent].right_child = node_id

        if is_leaf:
            # Leaves carry sentinel children/feature/threshold values.
            node.left_child = _TREE_LEAF
            node.right_child = _TREE_LEAF
            node.feature = _TREE_UNDEFINED
            node.threshold = _TREE_UNDEFINED

        else:
            # left_child and right_child will be set later
            node.feature = feature
            node.threshold = threshold
            node.missing_go_to_left = missing_go_to_left

        self.node_count += 1

        return node_id
cpdef cnp.ndarray predict(self, object X):
"""Predict target for X."""
out = self._get_value_ndarray().take(self.apply(X), axis=0,
mode='clip')
if self.n_outputs == 1:
out = out.reshape(X.shape[0], self.max_n_classes)
return out
cpdef cnp.ndarray apply(self, object X):
"""Finds the terminal region (=leaf node) for each sample in X."""
if issparse(X):
return self._apply_sparse_csr(X)
else:
return self._apply_dense(X)
    cdef inline cnp.ndarray _apply_dense(self, object X):
        """Finds the terminal region (=leaf node) for each sample in X.

        X must be a float32 ndarray; returns an intp array of leaf node
        ids, one per sample.
        """
        # Check input
        if not isinstance(X, np.ndarray):
            raise ValueError("X should be in np.ndarray format, got %s"
                             % type(X))

        if X.dtype != DTYPE:
            raise ValueError("X.dtype should be np.float32, got %s" % X.dtype)

        # Extract input
        cdef const float32_t[:, :] X_ndarray = X
        cdef intp_t n_samples = X.shape[0]
        cdef float32_t X_i_node_feature

        # Initialize output
        cdef intp_t[:] out = np.zeros(n_samples, dtype=np.intp)

        # Initialize auxiliary data-structure
        cdef Node* node = NULL
        cdef intp_t i = 0

        with nogil:
            for i in range(n_samples):
                # Start each traversal at the root (nodes[0]).
                node = self.nodes
                # While node not a leaf
                while node.left_child != _TREE_LEAF:
                    X_i_node_feature = X_ndarray[i, node.feature]
                    # ... and node.right_child != _TREE_LEAF:
                    if isnan(X_i_node_feature):
                        # Missing values follow the direction learned at fit time.
                        if node.missing_go_to_left:
                            node = &self.nodes[node.left_child]
                        else:
                            node = &self.nodes[node.right_child]
                    elif X_i_node_feature <= node.threshold:
                        node = &self.nodes[node.left_child]
                    else:
                        node = &self.nodes[node.right_child]

                out[i] = <intp_t>(node - self.nodes)  # node offset

        return np.asarray(out)