Source code for graphdot.dataset.qm7

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import scipy.io
import pandas as pd
from ase import Atoms
from ._get import get


def QM7(
    download_url='http://quantum-machine.org/data/qm7.mat',
    local_filename='qm7.mat', overwrite=False, ase=False
):
    '''A 7165-molecule subset of the GDB-13 dataset. Molecules have up to 23
    total atoms and 7 heavy atoms. Atomization energies are computed at the
    Perdew-Burke-Ernzerhof hybrid functional (PBE0) level.

    References:
    - L. C. Blum, J.-L. Reymond, 970 Million Druglike Small Molecules for
    Virtual Screening in the Chemical Universe Database GDB-13,
    J. Am. Chem. Soc., 131:8732, 2009.
    - M. Rupp, A. Tkatchenko, K.-R. Müller, O. A. von Lilienfeld: Fast and
    Accurate Modeling of Molecular Atomization Energies with Machine
    Learning, Physical Review Letters, 108(5):058301, 2012

    Parameters
    ----------
    download_url: str
        URL to download the qm7.mat data file.
    local_filename: str
        Name for local storage of the data file.
    overwrite: bool
        Whether or not to overwrite the local file if one already exists.
    ase: bool
        Whether to create ASE Atoms objects from the dataset. When true, an
        extra ``atoms`` column is added to the returned dataframe.

    Returns
    -------
    qm7: DataFrame
        A dataframe containing the data from QM7, one row per molecule, with
        the following columns:

        - ``columb_matrix``: per-molecule entry of the ``X`` field.
        - ``atomization_energy``: the flattened ``T`` field as floats.
        - ``atomic_charge``: per-molecule entry of the ``Z`` field.
        - ``xyz``: per-molecule entry of the ``R`` field.
        - ``split``: index of the row of the ``P`` field that lists the
          molecule (0 for molecules not listed in any row).
        - ``atoms``: ASE ``Atoms`` built from the entries with non-zero
          atomic charge; only present if ``ase`` is true.

    Raises
    ------
    RuntimeError
        If the data file cannot be obtained or parsed. The original error is
        attached as the cause of the exception.
    '''
    try:
        mat = scipy.io.loadmat(
            get(download_url, local_filename, overwrite=overwrite)
        )
    except Exception as e:
        # Keep the RuntimeError contract but chain the underlying failure so
        # that its traceback is not lost.
        raise RuntimeError(
            f'Loading {local_filename} failed due to error: {e}.'
        ) from e

    def _as_objects(array):
        # Pack each first-axis slice into one cell of a 1-D object array so
        # that a multi-dimensional field can be stored as a single dataframe
        # column. The builtin `object` is used because the `np.object` alias
        # has been removed from NumPy.
        out = np.empty(len(array), dtype=object)
        for i, element in enumerate(array):
            out[i] = element
        return out

    # Builtin `float`/`int` replace the removed `np.float`/`np.int` aliases.
    energy = mat['T'].ravel().astype(float)

    qm7 = pd.DataFrame(data=dict(
        columb_matrix=_as_objects(mat['X']),
        atomization_energy=energy,
        atomic_charge=_as_objects(mat['Z']),
        xyz=_as_objects(mat['R']),
        # Sized from the loaded data rather than a hard-coded molecule count.
        split=np.zeros(len(energy), dtype=int)
    ))

    # Each row of 'P' holds the indices of the molecules belonging to one
    # split; label those molecules with the row number.
    for i, s in enumerate(mat['P']):
        qm7.loc[s, 'split'] = i

    if ase:
        def _to_atoms(row):
            # Entries with zero atomic charge are dropped before building the
            # Atoms object (presumably padding up to the fixed row width).
            # NOTE(review): coordinates are handed to ASE unchanged; confirm
            # that the units of 'R' are what ASE expects.
            occupied = row.atomic_charge != 0
            return Atoms(row.atomic_charge[occupied], row.xyz[occupied])

        qm7['atoms'] = qm7.apply(_to_atoms, axis=1)

    return qm7