Source code for graphdot.dataset.qm9

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import io
import tarfile
import numpy as np
import pandas as pd
from ase import Atoms
from tqdm import tqdm
from ._get import get


def _parse_qm9_xyz(content):
    '''Parse the text of a single QM9 xyz entry into a flat record tuple.

    The returned tuple lines up, element by element, with the column list
    used by :py:func:`QM9` when it builds the dataframe.
    '''
    # The files spell exponents as '*^' (e.g. 1.5*^-6); rewrite them so that
    # float() can parse the numeric fields.
    lines = content.replace('*^', 'E').split('\n')
    n_atoms = int(lines[0])
    # The first 4 characters of the property line are dropped before
    # splitting (presumably a 'gdb ' tag -- confirm against the dataset
    # readme). What remains is the integer id followed by float properties.
    fields = lines[1][4:].strip().split('\t')
    # One tab-separated row per atom: symbol, x, y, z, Mulliken charge.
    # NOTE: coordinates and charges are kept as the raw text tokens read
    # from the file; no numeric conversion is performed here.
    atom_rows = [line.split('\t') for line in lines[2:n_atoms + 2]]
    symbols, x, y, z, charges = zip(*atom_rows)
    freq = [float(w) for w in lines[n_atoms + 2].strip().split('\t')]
    smiles = lines[n_atoms + 3].strip().split('\t')
    inchi = lines[n_atoms + 4].strip().split('\t')
    return tuple(
        # scalar properties
        [int(fields[0])]
        + [float(w) for w in fields[1:]]
        # atomic coordinates
        + [symbols, np.array([x, y, z]).T.tolist(), charges]
        # vibrational frequencies
        + [freq]
        # SMILES
        + smiles
        # InChI
        + inchi
    )


def QM9(
    download_url='https://ndownloader.figshare.com/files/3195389',
    local_filename='dsgdb9nsd.xyz.tar.bz2',
    overwrite=False,
    ase=False
):
    '''Quantum chemistry structures and properties of 134 kilo molecules.

    References:

    - Ramakrishnan, Raghunathan, et al. "Quantum chemistry structures and
      properties of 134 kilo molecules." Scientific data 1.1 (2014): 1-7.

    Parameters
    ----------
    download_url: str
        URL to download the QM9 ``.tar.bz2`` archive of xyz files.
    local_filename: str
        Name for local storage of the data file.
    overwrite: bool
        Whether or not to overwrite the local file if one already exists.
        NOTE(review): this flag is accepted but is currently not forwarded
        to the download helper -- confirm the helper's signature before
        wiring it through.
    ase: bool
        Whether to create ASE Atoms objects from the dataset. Only the
        literal value ``True`` enables this.

    Returns
    -------
    qm9: DataFrame
        A dataframe containing the data from QM9, one row per molecule,
        with columns ``id``, ``A``, ``B``, ``C``, ``mu``, ``alpha``,
        ``e_HOMO``, ``e_LUMO``, ``e_gap``, ``R2``, ``zpve``, ``U0``, ``U``,
        ``H``, ``G``, ``Cv``, ``symbols``, ``xyz``, ``charges_mulliken``,
        ``freq``, ``smiles_gdb``, ``smiles_opt``, ``inchi_gdb`` and
        ``inchi_opt``. An extra ``atoms`` column is added when ``ase`` is
        True.

    Raises
    ------
    RuntimeError
        If the data file cannot be acquired; the underlying exception is
        attached as the cause.
    '''
    try:
        f = get(download_url, local_filename)
    except Exception as e:
        # Chain the original exception so its traceback is not lost.
        raise RuntimeError(
            f'Acquiring {local_filename} failed due to error: {e}.'
        ) from e

    data = []
    # Context managers make sure the archive and every extracted member are
    # closed even when parsing fails part-way through.
    with tarfile.open(f, 'r:bz2') as tf:
        for xyz in tqdm(tf, total=133885):
            # extractfile() yields None for members that are not regular
            # files (e.g. directory entries); they carry no molecule data.
            if not xyz.isfile():
                continue
            with io.TextIOWrapper(tf.extractfile(xyz)) as stream:
                data.append(_parse_qm9_xyz(stream.read()))

    qm9 = pd.DataFrame(
        data,
        columns=[
            'id', 'A', 'B', 'C', 'mu', 'alpha', 'e_HOMO', 'e_LUMO', 'e_gap',
            'R2', 'zpve', 'U0', 'U', 'H', 'G', 'Cv', 'symbols', 'xyz',
            'charges_mulliken', 'freq', 'smiles_gdb', 'smiles_opt',
            'inchi_gdb', 'inchi_opt'
        ]
    )

    if ase is True:
        qm9['atoms'] = qm9.apply(
            lambda row: Atoms(
                symbols=row.symbols,
                positions=row.xyz,
                charges=row.charges_mulliken
            ),
            axis=1
        )

    return qm9