Input / Output

Reading and writing molecules and reactions in all supported formats.

String Parsers

SMILES

from chython import smiles

mol = smiles('CCO')          # ethanol
mol = smiles('c1ccccc1')     # benzene (aromatic)
mol = smiles('[Cu+2]')       # copper ion
mol = smiles('C/C=C/C')     # trans-2-butene (with stereo)

# Reaction SMILES
rxn = smiles('[CH3:1][OH:2]>>[CH3:1][NH2:3]')

InChI

from chython import inchi

mol = inchi('InChI=1S/C2H6O/c1-2-3/h3H,2H2,1H3')

IUPAC Name

Requires the OPSIN JAR. Set its path via the OPSIN_PATH environment variable or chython.class_paths[1].

from chython import iupac

mol = iupac('acetic acid')
mol = iupac('2-acetoxybenzoic acid')

MDL MOL Block

from chython import mdl_mol

mol_block = """
  Mrv2211 03232310102D

  3  2  0  0  0  0            999 V2000
    0.0000    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    1.5400    0.0000    0.0000 C   0  0  0  0  0  0  0  0  0  0  0  0
    3.0800    0.0000    0.0000 O   0  0  0  0  0  0  0  0  0  0  0  0
  1  2  1  0  0  0  0
  2  3  1  0  0  0  0
M  END
"""
mol = mdl_mol(mol_block)

XYZ Coordinates

from chython import xyz

mol = xyz((('O', 0., 0., 0.), ('H', 1., 0., 0.), ('H', 0., 1., 0.)))
mol.clean2d()

SMARTS

Parses SMARTS into QueryContainer for substructure matching. See Substructure Search & Fingerprints for full SMARTS syntax.

from chython import smarts

q = smarts('[C;r5,r6;a]-;!@[C;h0,h1]')
print(q)  # canonical atom order

File Readers

SDF / RDF

from chython import SDFRead, RDFRead

# Iterate molecules from SDF
with SDFRead('molecules.sdf') as reader:
    for mol in reader:
        print(mol.name, mol.meta)

# Iterate reactions from RDF
with RDFRead('reactions.rdf') as reader:
    for rxn in reader:
        print(rxn)

# Read all at once
with SDFRead('molecules.sdf') as reader:
    mols = reader.read()

# Read a limited batch
with SDFRead('molecules.sdf') as reader:
    first_100 = reader.read(amount=100)

# Generator: get first record, then read rest
with RDFRead('reactions.rdf') as f:
    first = next(f)
    rest = f.read()

# Indexed random access (Unix only)
with SDFRead('molecules.sdf', indexable=True) as reader:
    reader.reset_index()
    mol = reader[42]
    total = len(reader)

pathlib.Path objects are supported:

from pathlib import Path

with RDFRead(Path('reactions.rdf')) as r:
    rxn = next(r)

Already-open file objects are also supported (opened in text mode for all formats except MRV):

with open('reactions.rdf') as f, RDFRead(f) as r:
    rxn = next(r)

MRV

MRV files require binary mode:

from chython import MRVRead

with MRVRead(open('structures.mrv', 'rb')) as reader:
    for mol in reader:
        print(mol)

Reading from Archives and Network

Readers accept any file-like object, enabling transparent reading from compressed archives and network sources:

# gzip
from gzip import open as gzip_open
with gzip_open('data.rdf.gz', 'rt') as f, RDFRead(f) as r:
    rxn = next(r)

# zip
from zipfile import ZipFile
from io import TextIOWrapper
with ZipFile('data.zip') as z, z.open('data.rdf') as c:
    with TextIOWrapper(c) as f, RDFRead(f) as r:
        rxn = next(r)

# tar.gz
from tarfile import open as tar_open
with tar_open('data.tar.gz') as t:
    c = t.extractfile('data.rdf')
    with TextIOWrapper(c) as f, RDFRead(f) as r:
        rxn = next(r)

# URL via requests
from requests import get
from io import StringIO
with StringIO(get('https://example.com/data.rdf').text) as f, RDFRead(f) as r:
    rxn = next(r)

Other Readers

All readers share the same API (iteration, .read(), context manager).

  • SDFRead - MOL/SDF (V2000, V3000)

  • RDFRead - RXN/RDF

  • MRVRead - ChemAxon MRV (requires binary mode)

  • PDBRead - PDB format (explicit hydrogens only)

For SMILES, InChI and XYZ, use the string parsers (smiles(), inchi(), xyz()) directly. To process files with one record per line, iterate lines manually:

from chython import smiles

with open('molecules.smi') as f:
    for line in f:
        mol = smiles(line)
        print(mol)

Reader Options

MDL readers (SDFRead, RDFRead) accept these options:

with SDFRead('molecules.sdf',
             ignore=True,              # try to fix/skip errors (default)
             remap=False,              # renumber atoms from 1
             ignore_stereo=False,      # discard stereochemistry
             ignore_bad_isotopes=False, # reset invalid isotopes
             calc_cis_trans=False,      # recalculate cis/trans from 2D
             ) as reader:
    for mol in reader:
        pass

File Writers

SDF / RDF

from chython import SDFWrite, RDFWrite, ESDFWrite, ERDFWrite

# Write molecules to SDF (V2000)
with SDFWrite('output.sdf') as writer:
    writer.write(mol)

# V3000 extended format
with ESDFWrite('output_v3000.sdf') as writer:
    writer.write(mol)

# Write reactions to RDF (V2000)
with RDFWrite('output.rdf') as writer:
    writer.write(rxn)

# V3000 reactions
with ERDFWrite('output_v3000.rdf') as writer:
    writer.write(rxn)

# Append mode
with SDFWrite('output.sdf', append=True) as writer:
    writer.write(mol)

# Write with 3D coordinates (conformer index)
with SDFWrite('output_3d.sdf') as writer:
    writer.write(mol, write3d=0)

# Ongoing writing without context manager
f = RDFWrite('output.rdf')
for rxn in data:
    f.write(rxn)
f.close()

MRV

from chython import MRVWrite

with MRVWrite('output.mrv') as writer:
    writer.write(mol)

SMILES Strings

from chython import smiles

mol = smiles('CCO')

# Canonical SMILES
s = str(mol)           # or format(mol)

# Format specifiers
format(mol, 'm')   # include atom mapping numbers
format(mol, 'h')   # show implicit hydrogens
format(mol, 'r')   # random SMILES (non-canonical)
format(mol, 'a')   # asymmetric closures
format(mol, 'A')   # aromatic bonds (: notation) instead of lowercase atoms
format(mol, '!s')  # without stereo
format(mol, '!x')  # without CXSMILES extensions
format(mol, '!z')  # without charges
format(mol, '!b')  # without bond tokens
format(mol, 'mh')  # combine multiple: mapping + hydrogens
format(mol, 'h!b') # implicit H, no bond tokens

# Works with f-strings, %-formatting, .format()
print(f'{mol:A}')
print('smiles: %s' % mol)

Serialization

Pickle

Full pickle support for all containers. Faster than file formats for temporary storage:

from pickle import loads, dumps

data = dumps(mol)
mol = loads(data)

# Works for reactions too
data = dumps(rxn)
rxn = loads(data)

Chython Binary Pack

A compact binary format that stores 2D coordinates, stereo, charges, isotopes, radicals, and atom numbers. Packed data is about 1.5-2x larger than the equivalent SMILES, and parsing it is faster than unpickling.

from chython import MoleculeContainer, ReactionContainer, unpack

# Pack to bytes (zlib compressed by default)
data = mol.pack()
data = bytes(mol)          # same as pack()

# Unpack (auto-detects molecule or reaction)
restored = unpack(data)

# Or unpack with explicit type
restored = MoleculeContainer.unpack(data)

# Uncompressed
data = mol.pack(compressed=False)
restored = unpack(data, compressed=False)

# Peek at atom count without unpacking
count = MoleculeContainer.pack_len(data, compressed=False)

# Reactions
rxn_data = rxn.pack()
rxn_restored = ReactionContainer.unpack(rxn_data)

Metadata

Metadata read from SDF/RDF files is available on the resulting molecule and reaction objects:

rxn = next(RDFRead('reactions.rdf'))
rxn.meta           # dict of DTYPE/DATUM fields
rxn.name           # reaction title from RDF

mol = rxn.reactants[0]
mol.name           # molecule title from MOL block
mol.meta           # molecule metadata dict

# Set metadata for writing
mol.name = 'Ethanol'
mol.meta['boiling_point'] = '78.37'