A pure Python library for SMILES/SMARTS parsing, canonicalization, and molecular manipulation.
pip install -e .
pip install chiralipy
from chiralipy import parse_smiles, canonical_smiles
# Parse and canonicalize
mol = parse_smiles("C(C)CC")
print(canonical_smiles(mol)) # CCCC
# Substructure matching
from chiralipy.match import substructure_search, has_substructure
mol = parse_smiles("c1ccccc1CCO")
pattern = parse_smiles("c1ccccc1")
print(has_substructure(mol, pattern)) # True
matches = substructure_search(mol, pattern) # [(0, 1, 2, 3, 4, 5)]
# BRICS decomposition
from chiralipy.decompose import brics_decompose
mol = parse_smiles("CCOc1ccc(CC)cc1")
fragments = brics_decompose(mol)
print(sorted(fragments))
# ['[16*]c1ccc([16*])cc1', '[3*]O[3*]', '[4*]CC', '[8*]CC']

The BRICS implementation uses an iterative work-queue algorithm that partitions cleavable bonds into two groups:
- Batch bonds (chain environments like amides L5, esters L3, linkers L1/L4) are cleaved together in a single pass — $O(V+E)$ per molecule.
- Sequential bonds (ring environments L13–L16) are cleaved one at a time because breaking a ring bond changes the topology visible to subsequent rules.
The batch/sequential split avoids the combinatorial blowup that a naive "try every subset" approach would hit, while still handling ring-breaking edge cases correctly.
from chiralipy import parse_smiles, canonical_smiles, to_smiles
from chiralipy.rings import find_sssr
from chiralipy.transform import kekulize
from chiralipy.decompose import get_scaffold
# Ring detection (SSSR)
mol = parse_smiles("c1ccc2ccccc2c1") # naphthalene
rings = find_sssr(mol)
print(len(rings)) # 2
# Kekulization
mol = parse_smiles("c1ccccc1") # benzene (aromatic)
kekulized = kekulize(mol)
print(to_smiles(kekulized)) # C1=CC=CC=C1
# Stereochemistry
mol = parse_smiles("C/C=C/C") # trans-2-butene
print(canonical_smiles(mol)) # C/C=C/C
mol = parse_smiles("C[C@H](O)F") # chiral center
print(canonical_smiles(mol)) # C[C@H](O)F
# Murcko scaffold
mol = parse_smiles("Cc1ccc(C(=O)O)cc1") # p-toluic acid
scaffold = get_scaffold(mol)
print(canonical_smiles(scaffold)) # c1ccccc1
# Molecule properties
mol = parse_smiles("CCO") # ethanol
print(mol.num_atoms) # 3
print(len(mol.bonds)) # 2

- SMILES/SMARTS parsing with full stereochemistry support
- Canonical SMILES generation
- Substructure matching (RDKit-compatible)
- BRICS decomposition for retrosynthetic fragmentation
- Aromaticity perception based on Hückel's 4n+2 rule
- Ring detection (SSSR algorithm)
- Zero dependencies — pure Python
from chiralipy import parse_smiles, canonical_smiles
from chiralipy.decompose import brics_decompose, get_scaffold
from chiralipy.rings import find_sssr
# Imatinib (Gleevec) - a tyrosine kinase inhibitor
imatinib = "Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C"
mol = parse_smiles(imatinib)
print(canonical_smiles(mol))
# Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1
print(mol.num_atoms) # 37
print(len(find_sssr(mol))) # 5 rings
print(len(brics_decompose(mol))) # 8 fragments

from chiralipy import parse_smiles, canonical_smiles
# Chiral centers - L-alanine vs D-alanine
l_alanine = parse_smiles("C[C@H](N)C(=O)O")
d_alanine = parse_smiles("C[C@@H](N)C(=O)O")
print(canonical_smiles(l_alanine)) # C[C@H](N)C(=O)O
print(canonical_smiles(d_alanine)) # C[C@@H](N)C(=O)O
# E/Z isomers - stilbene
e_stilbene = parse_smiles("c1ccccc1/C=C/c1ccccc1")
z_stilbene = parse_smiles("c1ccccc1/C=C\\c1ccccc1")
print(canonical_smiles(e_stilbene)) # C(=C/c1ccccc1)/c1ccccc1
print(canonical_smiles(z_stilbene)) # C(=C\c1ccccc1)/c1ccccc1

from chiralipy import parse_smiles, canonical_smiles
from chiralipy.decompose import get_scaffold
# Extract core scaffold from atorvastatin (Lipitor)
atorvastatin = "CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccc(F)cc2)c(-c2ccccc2)n1CCC(O)CC(O)CC(=O)O"
mol = parse_smiles(atorvastatin)
scaffold = get_scaffold(mol)
print(canonical_smiles(scaffold))
# c1ccc(NCC2CNC(c3ccccc3)C2c2ccccc2)cc1

chiralipy produces identical canonical SMILES to RDKit:
# With RDKit
from rdkit import Chem
rdkit_mol = Chem.MolFromSmiles("Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C")
print(Chem.MolToSmiles(rdkit_mol))
# Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1
# With chiralipy
from chiralipy import parse_smiles, canonical_smiles
mol = parse_smiles("Cc1ccc(cc1Nc2nccc(n2)c3cccnc3)NC(=O)c4ccc(cc4)CN5CCN(CC5)C")
print(canonical_smiles(mol))
# Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc(-c2cccnc2)n1 ✓ identical

| Feature | RDKit | chiralipy |
|---|---|---|
| Dependencies | C++ binaries | None (pure Python) |
| Installation | Complex | pip install chiralipy |
| Canonical SMILES | ✓ | ✓ (identical output) |
| BRICS decomposition | ✓ | ✓ (regression-tested against RDKit) |
| Substructure search | ✓ | ✓ |
| Ring detection | ✓ | ✓ (SSSR) |
| Stereochemistry | ✓ | ✓ |
| Performance | ~8x faster | Pure Python (with caching) |
Use chiralipy when:
- You need a lightweight, dependency-free solution
- Installing RDKit is difficult (e.g., serverless, embedded systems)
- You want readable, hackable Python code
Use RDKit when:
- Performance is critical for large-scale processing
- You need advanced features (conformer generation, force fields, etc.)
from chiralipy import parse_smiles, canonical_smiles, to_smiles
from chiralipy.match import substructure_search, has_substructure
from chiralipy.decompose import brics_decompose, get_scaffold
from chiralipy.rings import find_sssr
from chiralipy.transform import kekulize

Comparison against RDKit (C++ implementation):
Molecule Atoms RDKit ms chiralipy ms Ratio
------------------------------------------------------------
methane 1 0.008 0.030 3.6x
ethanol 3 0.009 0.065 7.4x
benzene 6 0.029 0.229 8.0x
naphthalene 10 0.048 0.422 8.9x
ibuprofen 15 0.057 0.392 6.8x
imatinib 37 0.163 1.290 7.9x
chiralipy caches string inputs with an LRU cache (4096 entries). First calls are slower than RDKit, but repeated calls with the same input are ~100-1000x faster than RDKit:
Molecule Atoms RDKit ms First ms Cached ms First ratio
---------------------------------------------------------------------------
methane 1 0.017 0.075 0.0002 4.3x
ethanol 3 0.019 0.189 0.0002 10.1x
benzene 6 0.049 0.536 0.0002 11.0x
naphthalene 10 0.088 1.093 0.0002 12.5x
ibuprofen 15 0.095 1.010 0.0002 10.7x
imatinib 37 0.260 2.940 0.0002 11.3x
Note: Cached calls are ~100-1000x faster than RDKit for repeated queries
Comparison against RDKit (C++ implementation):
Molecule Atoms Bonds RDKit ms chiralipy ms Ratio
----------------------------------------------------------------------
small_ether 5 4 0.50 2.63 5.2x
medium_drug 11 11 0.81 6.62 8.2x
drug_like 37 41 4.52 39.48 8.7x
complex 39 43 17.02 102.07 6.0x
Average: ~7x slower than RDKit for first calls
MIT