Source code for pygauss.analysis

# -*- coding: utf-8 -*-
from itertools import product, imap
import copy
import math
import string
import multiprocessing
import platform

import numpy as np
import pandas as pd
from pandas.tools.plotting import radviz
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.cluster import KMeans

from IPython.core.display import clear_output

from .molecule import Molecule
from .utils import df_to_img
from .file_io import Folder

[docs]def unpack_and_make_molecule(val_dict):      
    if val_dict.has_key('args'):
        args = val_dict.pop('args')
    else:
        args = []            
    return Molecule(*args, **val_dict)

[docs]class Analysis(object):
    """a class to analyse multiple computations """
    
    def __init__(self, folderpath='', server=None, username=None, passwrd=None, 
                 folder_obj=None, headers=[]):
        """a class to analyse multiple computations

        Parameters
        ----------
        folderpath : str
            the folder directory storing the files to be analysed
        server : str
            the name of the server storing the files to be analysed
        username : str
            the username to connect to the server
        passwrd : str
            server password, if not present it will be asked for during initialisation
        headers : list
            the variable categories for each computation
            
        """  
        self._folder = None  
        if folder_obj:
            self._folder = folder_obj                              
        elif folderpath or server: 
            self.set_folder(folderpath, server, username, passwrd)
            
        heads = headers[:]+['Molecule']
        
        self._df = pd.DataFrame(columns=heads)
        self._df.index.name = 'ID'
        self._next_index = 0
        
    def __repr__(self):
        return self.get_table().to_string()
        
[docs]    def copy(self):
        clone = copy.deepcopy(self)
        return clone        
    
[docs]    def get_folder(self):
        return self._folder
        
[docs]    def set_folder(self, folderpath='', server=None, 
                       username=None, passwrd=None): 
                   
        self._folder = Folder(folderpath, server, username, passwrd)
            
    folder = property(get_folder, set_folder, 
                        doc="The folder for gaussian runs")      

[docs]    def count_runs(self):
        """ get number of runs held in analysis """
        return len(self._df.index)        
        
    def _add_molecule(self, molecule, identifiers):
        """add molecule to internal dataframe """

        identifiers['Molecule'] = molecule
        series = pd.DataFrame(identifiers, 
                              index=[self._next_index])
        self._df = self._df.copy().append(series)
        self._next_index += 1
        
        return True
    
[docs]    def add_run(self, identifiers={}, 
                      init_fname=None, opt_fname=None, 
                      freq_fname=None, nbo_fname=None,
                      alignto=[], atom_groups={},
                      add_if_error=False, folder_obj=None):
        """add single Gaussian run input/outputs """             
        if not folder_obj:
            folder_obj = self._folder
            
        molecule = Molecule(init_fname=init_fname, 
                            opt_fname=opt_fname, 
                            freq_fname=freq_fname, 
                            nbo_fname=nbo_fname,
                            folder_obj=folder_obj,
                            alignto=alignto, atom_groups=atom_groups,
                            fail_silently=True)
        
        num_files = filter(lambda x:x, [init_fname, opt_fname, 
                                        freq_fname, nbo_fname])
        read_errors = molecule.get_init_read_errors()
        if len(read_errors) != num_files and (not read_errors or add_if_error):                
                self._add_molecule(molecule, identifiers)
                
        return molecule.get_init_read_errors() 
             
    def _get_molecules(self, mol_inputs, folder_obj, identifiers, ipython_print=False):
        """ get molecules """

        if folder_obj.islocal() and not platform.system() == 'Windows':    
            pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
            mapping = pool.imap
        else:
            mapping = imap
        
        with folder_obj:
            molecules=[]
            all_read_errors = []
            for molecule in mapping(unpack_and_make_molecule, mol_inputs):
                molecules.append(molecule)

                read_errors = []
                for typ, fname, msg in molecule.get_init_read_errors():
                    idents = identifiers[len(molecules)-1].copy()
                    idents.pop('Molecule', '_')
                    idents['Type'] = typ
                    idents['File'] = fname
                    idents['Error_Message'] = msg
                    read_errors.append(idents)
                all_read_errors.append(read_errors)

                if ipython_print: 
                    print 'Reading data {0} of {1}'.format(len(molecules), 
                                                           len(mol_inputs))
                    try:
                        clear_output(wait=True)    
                    except:
                        pass

        if folder_obj.islocal() and not platform.system() == 'Windows':
            pool.close()
            pool.join()
                    
        return molecules, all_read_errors                    

[docs]    def add_runs(self, headers=[], values=[], 
                 init_pattern=None, opt_pattern=None, 
                 freq_pattern=None, nbo_pattern=None,
                 add_if_error=False,
                 alignto=[], atom_groups={},
                 ipython_print=False, folder_obj=None):
        """add multiple Gaussian run inputs/outputs """ 
        # set folder oject
        if not folder_obj:
            folder_obj = self._folder

        #get variables for each run
        mol_inputs = []
        identifiers = []
        for idents in product(*values):
            mol_input = {}
            identifiers.append(dict(zip(headers, idents)))
            mol_input['init_fname'] = init_pattern.format(*idents) if init_pattern else None
            if type(opt_pattern) is str:
                mol_input['opt_fname'] = opt_pattern.format(*idents) if opt_pattern else None
            elif type(opt_pattern) is list or type(opt_pattern) is tuple:
                mol_input['opt_fname'] = [o.format(*idents) for o in opt_pattern]
            else:
                mol_input['opt_fname'] = None
            mol_input['freq_fname'] = freq_pattern.format(*idents) if freq_pattern else None
            mol_input['nbo_fname'] = nbo_pattern.format(*idents) if nbo_pattern else None
            
            mol_input['folder_obj'] = folder_obj
            mol_input['alignto'] = alignto 
            mol_input['atom_groups'] = atom_groups
            mol_input['fail_silently'] = True
                            
            mol_inputs.append(mol_input)
            
        #create the molecules
        molecules, read_errors = self._get_molecules(mol_inputs, folder_obj, 
                                                     identifiers, ipython_print)

        #add the molecules to the internal table  
        for molecule, idents, inputs, read_error in zip(molecules, identifiers, 
                                                         mol_inputs, read_errors):
            num_files = filter(lambda x:x, [inputs['init_fname'], inputs['opt_fname'], 
                                            inputs['freq_fname'], inputs['nbo_fname']])
            if read_error != num_files and (not read_error or add_if_error):
                self._add_molecule(molecule, idents)
        
        #collate read errors into a dataframe to return  
        read_errors = filter(len, read_errors)                         
        err_df = pd.DataFrame([item for sublist in read_errors for item in sublist])
        if read_errors:
            cols = err_df.columns.tolist()
            #rearrange columns headers
            cols.remove('Type'); cols.append('Type')
            cols.remove('File'); cols.append('File')
            cols.remove('Error_Message'); cols.append('Error_Message')
            err_df = err_df[cols]
        
        return err_df
                
[docs]    def get_table(self, rows=[], columns=[],  filters={},
                  precision=4, head=False, mol=False, 
                  row_index=[], column_index=[], 
                  as_image=False, na_rep='-', font_size=None,
                  width=None, height=None, unconfined=False):
        """return pandas table of requested data in requested format

        Parameters
        -----------
        rows : integer or list of integers
            select row ids
        columns : string/integer or list of strings/integers
            select column names/positions
        filters : dict
            filter for rows with certain value(s) in specific columns
        precision : int
            decimal precision of displayed values
        head : int
            return only first n rows
        mol : bool
            include column containing the molecule objects
        row_index : string or list of strings
            columns to use as new index        
        column_index : list of strings
            srings to place in to higher order column indexs 
        as_image : bool
            output the table as an image (used pygauss.utils.df_to_img)
        na_rep : str
            how to represent empty (nan) cells (if outputting image)
        width, height, unconfined : int, int, bool
            args for IPy Image
        
        Returns
        -------
        df : pandas.DataFrame
            a table of data            
            
        """
        pd.set_option('precision', precision)
        
        if mol:
            df = self._df.copy()
        else:
            df = self._df.drop('Molecule', axis=1)
        
        for key, val in filters.iteritems():
            if type(val) is list or type(val) is tuple:
                 df = df[getattr(df, key).isin(val)]
            else:
                df = df[getattr(df, key)==val]
            
        if type(rows) is not list and type(rows) is not tuple:
            rows = [rows]
        if type(columns) is not list and type(columns) is not tuple:
            columns = [columns]
        if rows:
            df = df.loc[rows] 
        if columns:
            cols = columns[:]
            if type(row_index) is list:
                cols += row_index
            else:
                cols.append(row_index)
            if mol:
                cols.append('Molecule')
            unique_cols = []
            [unique_cols.append(x) for x in cols if x not in unique_cols]
            df = df.ix[:,unique_cols]            
            
        if row_index: df = df.set_index(row_index) 

        if column_index:
            col_index=[]
            for col in df.columns:
                col_tuple = (' ', col)
                for term in column_index:
                    if len(col)>len(term):
                        if col[:len(term)] == term:
                            col_tuple = (term, col[len(term)+1:])
                            continue
                col_index.append(col_tuple)
            df.columns = pd.MultiIndex.from_tuples(col_index)
            
        if head:
            df = df.head(head)
        
        if as_image:
            return df_to_img(df, na_rep=na_rep, font_size=font_size,
                             width=width, height=height, unconfined=unconfined)            
            
        return df
        
[docs]    def remove_rows(self, rows):
        """remove one or more rows of molecules

        Parameters
        ----------
        rows : int or list of ints:
            the rows to remove
        
        """
        self._df.drop(rows, inplace=True)

        return self.get_table()
        
[docs]    def remove_columns(self, columns):

        self._df.drop(columns, axis=1, inplace=True) 

        return self.get_table()

    _basic_properties={'nbasis':'get_basis_funcs',
                        'basis':'get_basis_descript',
                       'optimised':'is_optimised',
                       'opt_error': 'get_run_error',
                       'conformer': 'is_conformer'}
                       
[docs]    def get_basic_property(self, prop, *args, **kwargs):
        """returns a series of a basic run property or nan if it is not available

        Parameters
        ----------
        prop : str
            can be 'basis', 'nbasis', 'optimised', 'opt_error' or 'conformer'        
        """
        if prop not in self._basic_properties.keys():
            raise ValueError('{0} not a molecule property'.format(prop))
        
        def get_prop(m):
            method = getattr(m, self._basic_properties[prop])
            try: 
                out = method(*args, **kwargs)
            except:
                out = pd.np.nan
            return out
            
        return self._df.Molecule.map(get_prop)

[docs]    def add_basic_properties(self, props=['basis', 'nbasis', 
                                          'optimised', 'conformer']):
        """adds columns giving info of basic run properties """
        for prop in props:
            try:
                series = self.get_basic_property(prop)
            except Exception:
                print 'error reading {0} \n setting to NaN'.format(prop)
                series = pd.np.nan
            self._df[prop.capitalize()] = series  
        
        return self.get_table()
    
[docs]    def remove_non_optimised(self):
        """removes runs that were not optimised """
        non_optimised = self._df[self.get_basic_property('optimised')!=True].copy()
        self._df = self._df[self.get_basic_property('optimised')==True]
        return non_optimised
        
[docs]    def remove_non_conformers(self, cutoff=0.):
        """removes runs with negative frequencies """
        non_conformers = self._df[self.get_basic_property('conformer', cutoff=cutoff)!=True].copy()
        self._df = self._df[self.get_basic_property('conformer', cutoff=cutoff)==True]
        return non_conformers

[docs]    def add_mol_property(self, name, method, *args, **kwargs):
        """compute molecule property for all rows and create a data column 
        
        Parameters
        ----------
        name : str
            what to name the data column
        method : str
            what molecule method to call 
        *args : various
            arguments to pass to the molecule method
        **kwargs : various
            keyword arguments to pass to the molecule method

        """
                
        if type(name) is tuple or type(name) is list:
            for idx, n in enumerate(name):
                func = lambda m: getattr(m, method)(*args, **kwargs)[idx]
                self._df[n] = self._df.Molecule.map(func)
        else:
            func = lambda m: getattr(m, method)(*args, **kwargs)
            self._df[name] = self._df.Molecule.map(func)
        
        return self.get_table()

[docs]    def add_mol_property_subset(self, name, method, 
                                     rows=[], filters={}, 
                                     args=[], kwargs={},
                                     relative_to_rows=[]):
        """compute molecule property for a subset of rows and create/add-to data column 

        Parameters
        ----------
        name : str or list of strings
            name for output column (multiple if method outputs more than one value)
        method : str
            what molecule method to call 
        rows : list
            what molecule rows to calculate the property for
        filters : dict
            filter for selecting molecules to calculate the property for
        args : list
            the arguments to pass to the molecule method
        kwargs : dict
            the keyword arguments to pass to the molecule method
        relative_to_rows: list of ints
            compute values relative to the summated value(s) of molecule at the 
            rows listed
        
        """
        df = self.get_table(rows=rows, filters=filters, mol=True)

        if relative_to_rows:
            rel_df = self.get_table(rows=relative_to_rows, mol=True)

        if type(name) is tuple or type(name) is list:
            
            for idx, n in enumerate(name):
                func = lambda m: getattr(m, method)(*args, **kwargs)[idx]
                vals = df.Molecule.map(func)
                
                if relative_to_rows:
                    rel_val = rel_df.Molecule.map(func).sum()
                    vals = vals - rel_val
                    
                if n in self._df.columns:
                    self._df[n] = vals.combine_first(self._df[n])
                else:                
                    self._df[n] = vals
            
    
        else:
            func = lambda m: getattr(m, method)(*args, **kwargs)
            vals = df.Molecule.map(func)
                
            if relative_to_rows:
                rel_val = rel_df.Molecule.map(func).sum()
                vals = vals - rel_val
                    
            if name in self._df.columns:
                self._df[name] = vals.combine_first(self._df[name])
            else:                
                self._df[name] = vals

        return self.get_table()
            
[docs]    def get_ids(self, variable_names, variable_lists):
        """return ids of a list of unique computations """
        df = self.get_table()
        df['Index'] = df.index
        df.set_index(variable_names, inplace=True)
        df.sortlevel(inplace=True)
        
        ids = []
        for variable_lst in variable_lists:
            df1 = df.copy()
            try:                
                for v in variable_lst:
                    df1 = df1.loc[v]
            except KeyError:
                raise ValueError(
                        'could not find variable set; {}'.format(variable_lst))
            i = df1.Index
            if hasattr(i, 'values'):
                raise ValueError(
                        'variable set is not unique; {}'.format(variable_lst))
            ids.append(int(i))
        return ids

    
[docs]    def get_molecule(self, row):
        """ get molecule object coresponding to particular row """
        return copy.deepcopy(self._df.Molecule.loc[row])
    
    ## TODO will active work?
[docs]    def yield_mol_images(self, rows=[], filters={}, mtype='optimised',
                         sort_columns=[], align_to=[], rotations=[[0., 0., 0.]],
                         gbonds=True, represent='ball_stick', 
                         zoom=1., width=300, height=300, axis_length=0,
                         relative=False, minval=-1, maxval=1,
                         highlight=[], active=False, 
                         sopt_min_energy=20., sopt_cutoff_energy=0.,
                         atom_groups=[], alpha=0.5, transparent=False,
                         hbondwidth=5, eunits='kJmol-1', no_hbonds=False, 
                         ipyimg=True):
        """yields molecules

        Parameters
        ----------
        mtype :
            'initial', 'optimised', 'nbo', 'highlight', 'highlight-initial', 'sopt' or 'hbond'
        info_columns : list of str
            columns to use as info in caption
        max_cols : int
            maximum columns in plot
        label_size : int
            subplot label size (pts)
        start_letter : str
            starting (capital) letter for labelling subplots
        save_fname : str
            name of file, if you wish to save the plot to file
        rows : int or list
            index for the row of each molecule to plot (all plotted if empty)
        filters : dict
            {columns:values} to filter by
        sort_columns : list of str
            columns to sort by
        align_to : [int, int, int]
            align geometries to the plane containing these atoms
        rotations : list of [float, float, float]
            for each rotation set [x,y,z] an image will be produced 
        gbonds : bool
            guess bonds between atoms (via distance)
        represent : str
            representation of molecule ('none', 'wire', 'vdw' or 'ball_stick')
        zoom : float
            zoom level of images
        width : int
            width of original images
        height : int
            height of original images (although width takes precedent)
        axis_length : float
            length of x,y,z axes in negative and positive directions
        relative : bool
            coloring of nbo atoms scaled to min/max values in atom set (for nbo mtype)
        minval : float
            coloring of nbo atoms scaled to absolute min (for nbo mtype)
        maxval : float
            coloring of nbo atoms scaled to absolute max (for nbo mtype)
        highlight : list of lists
            atom indxes to highlight (for highlight mtype)
        eunits : str
            the units of energy to return (for sopt/hbond mtype)
        sopt_min_energy : float
            minimum energy to show (for sopt/hbond mtype)
        sopt_cutoff_energy : float
            energy below which bonds will be dashed (for sopt mtype)
        alpha : float
            alpha color value of geometry (for highlight/sopt/hbond mtypes)
        transparent : bool
            whether atoms should be transparent (for highlight/sopt/hbond mtypes)
        hbondwidth : float   
            width of lines depicting interaction (for hbond mtypes)  
        atom_groups : [list or str, list or str]
            restrict interactions to between two lists (or identifiers) of atom indexes (for sopt/hbond mtypes)
        no_hbonds : bool
            whether to ignore H-Bonds in the calculation
        ipyimg : bool
            whether to return an IPython image, PIL image otherwise 
        
        Yields
        -------
        indx : int
            the row index of the molecule
        mol : IPython.display.Image or PIL.Image
            an image of the molecule in the format specified by ipyimg            
        
        """
        df = self.get_table(columns=['Molecule']+sort_columns, rows=rows, 
                       filters=filters, mol=True)
        
        if sort_columns:
            df.sort(sort_columns, inplace=True)
        
        for indx, mol in zip(df.index, df.Molecule):
            if align_to: 
                align_atoms = mol.get_atom_group(align_to)
                mol.set_alignment_atoms(*align_atoms)
            if mtype == 'initial':
                yield indx, mol.show_initial(gbonds=gbonds, represent=represent, 
                                       rotations=rotations, zoom=zoom, 
                                       width=width, height=height, 
                                       axis_length=axis_length, ipyimg=ipyimg)
            elif mtype == 'optimised':
                yield indx, mol.show_optimisation(gbonds=gbonds, represent=represent, 
                                       rotations=rotations, zoom=zoom, 
                                       width=width, height=height, 
                                       axis_length=axis_length, ipyimg=ipyimg)
            elif mtype == 'nbo':
                yield indx, mol.show_nbo_charges(gbonds=gbonds, represent=represent, 
                                       rotations=rotations, zoom=zoom, 
                                       width=width, height=height, 
                                       axis_length=axis_length,
                                       relative=relative, 
                                       minval=minval, maxval=maxval, ipyimg=ipyimg)
            elif mtype == 'highlight':
                yield indx, mol.show_highlight_atoms(highlight, 
                                       alpha=alpha, optimised=True,
                                       transparent=transparent,
                                       gbonds=gbonds, 
                                       represent=represent, 
                                       rotations=rotations, zoom=zoom, 
                                       width=width, height=height, 
                                       axis_length=axis_length, ipyimg=ipyimg)
            elif mtype == 'highlight-initial':
                yield indx, mol.show_highlight_atoms(highlight, 
                                       alpha=alpha, optimised=False,
                                       transparent=transparent,
                                       gbonds=gbonds, 
                                       represent=represent, 
                                       rotations=rotations, zoom=zoom, 
                                       width=width, height=height, 
                                       axis_length=axis_length, ipyimg=ipyimg)
            elif mtype == 'sopt':
                yield indx, mol.show_sopt_bonds(min_energy=sopt_min_energy,
                                    cutoff_energy=sopt_cutoff_energy, no_hbonds=no_hbonds, 
                                    eunits=eunits, atom_groups=atom_groups,
                                    alpha=alpha, transparent=transparent,
                                    gbonds=gbonds, represent=represent, 
                                    rotations=rotations, zoom=zoom, 
                                    width=width, height=height, 
                                    axis_length=axis_length,
                                    relative=relative, 
                                    minval=minval, maxval=maxval, ipyimg=ipyimg)
            elif mtype == 'hbond':
                yield indx, mol.show_hbond_analysis(min_energy=sopt_min_energy,
                                    cutoff_energy=sopt_cutoff_energy, eunits=eunits,
                                    atom_groups=atom_groups, bondwidth=hbondwidth,
                                    alpha=alpha, transparent=transparent,
                                    gbonds=gbonds, represent=represent, 
                                    rotations=rotations, zoom=zoom, 
                                    width=width, height=height, 
                                    axis_length=axis_length,
                                    relative=relative, 
                                    minval=minval, maxval=maxval, ipyimg=ipyimg)
            else:
                raise ValueError(
                'mtype must be initial, optimised, nbo, highlight, highligh-initial, sopt or hbond')                
    
    def _get_letter(self, number):
        """get an uppercase letter according to a number"""
        if number < 26:
            return string.ascii_uppercase[number]
        else:
            first_letter = string.ascii_uppercase[int(number/26)-1]
            second_letter = string.ascii_uppercase[number % 26]
            return first_letter + second_letter
        
[docs]    def plot_mol_images(self, mtype='optimised', max_cols=1, padding=(1, 1),
                        sort_columns=[], info_columns=[], info_incl_id=False,
                        label_size=20, letter_prefix='', start_letter='A',
                        rows=[], filters={}, align_to=[], rotations=[[0., 0., 0.]],
                        gbonds=True, represent='ball_stick',
                        zoom=1., width=500, height=500, axis_length=0,
                        relative=False, minval=-1, maxval=1,
                        highlight=[], frame_on=False, eunits='kJmol-1',
                        sopt_min_energy=20., sopt_cutoff_energy=0.,
                        atom_groups=[], alpha=0.5, transparent=False,
                        hbondwidth=5, no_hbonds=False):
        """show molecules in matplotlib table of axes

        Parameters
        ----------
        mtype :
            'initial', 'optimised', 'nbo', 'highlight', 'highlight-initial', 'sopt' or 'hbond'
        max_cols : int
            maximum columns in plot
        padding: tuple
            padding between images (horizontally, vertically)
        sort_columns : list of str
            columns to sort by
        info_columns : list of str
            columns to use as info in caption
        info_incl_id : bool
            include molecule id number in caption
        label_size : int
            subplot label size (pts)
        letter_prefix : str
            prefix for labelling subplots
        start_letter : str
            starting (capital) letter for labelling subplots
        rows : int or list
            index for the row of each molecule to plot (all plotted if empty)
        filters : dict
            {columns:values} to filter by
        align_to : [int, int, int]
            align geometries to the plane containing these atoms
        rotations : list of [float, float, float]
            for each rotation set [x,y,z] an image will be produced 
        gbonds : bool
            guess bonds between atoms (via distance)
        represent : str
            representation of molecule ('none', 'wire', 'vdw' or 'ball_stick')
        zoom : float
            zoom level of images
        width : int
            width of original images
        height : int
            height of original images (although width takes precedent)
        axis_length : float
            length of x,y,z axes in negative and positive directions
        relative : bool
            coloring of nbo atoms scaled to min/max values in atom set (for nbo mtype)
        minval : float
            coloring of nbo atoms scaled to absolute min (for nbo mtype)
        maxval : float
            coloring of nbo atoms scaled to absolute max (for nbo mtype)
        highlight : list of lists
            atom indxes to highlight (for highlight mtype)
        eunits : str
            the units of energy to return (for sopt/hbond mtype)
        sopt_min_energy : float
            minimum energy to show (for sopt/hbond mtype)
        sopt_cutoff_energy : float
            energy below which bonds will be dashed (for sopt mtype)
        alpha : float
            alpha color value of geometry (for sopt/hbond mtypes)
        transparent : bool
            whether atoms should be transparent (for sopt/hbond mtypes)
        hbondwidth : float   
            width of lines depicting interaction (for hbond mtypes)     
        atom_groups : [list or str, list or str]
            restrict interactions to between two lists (or identifiers) of atom indexes (for sopt/hbond mtypes)
        no_hbonds : bool
            whether to ignore H-Bonds in the calculation (for sopt only)
        frame_on : bool
            whether to show frame around each image 

        Returns
        -------
        fig : matplotlib.figure.Figure
            A figure containing subplots for each molecule image
        caption : str
            A caption describing each subplot, given info_columns
        
        """
        letter_offset = string.ascii_uppercase.find(start_letter)
        if letter_offset == -1:
            raise ValueError('start_letter must be an uppercase single letter')
        
        df = self.get_table(rows=rows, columns=info_columns, filters=filters)
        num_mols = len(df)
        
        imgs = self.yield_mol_images(rows=rows, filters=filters, mtype=mtype,
                        align_to=align_to, gbonds=gbonds, represent=represent, 
                        sort_columns=sort_columns, rotations=rotations, zoom=zoom, 
                        width=width, height=height, axis_length=axis_length,
                        relative=relative, minval=minval, maxval=maxval,
                        highlight=highlight, active=False, ipyimg=False,
                        eunits=eunits, sopt_min_energy=sopt_min_energy, 
                        sopt_cutoff_energy=sopt_cutoff_energy,
                        atom_groups=atom_groups, alpha=alpha, 
                        transparent=transparent,
                        hbondwidth=hbondwidth, no_hbonds=no_hbonds)
        
        #num_rows = int(math.ceil(num_mols/float(max_cols)))
        #num_cols = min([max_cols, num_mols])
        num_cols=int(max_cols)
        num_rows=int(math.ceil(num_mols/float(num_cols)))
 
        fig, axes = plt.subplots(num_rows, num_cols, squeeze=False,
                                 gridspec_kw={'width_ratios':[1]*num_cols})
                                 
        for ax in fig.get_axes():
            ax.axes.get_xaxis().set_visible(False)
            ax.axes.get_yaxis().set_visible(False)
            ax.set_anchor('NW')
            ax.set_frame_on(False)

        mol_num = 0
        caption = []                       
        for indx, img in imgs:

            ax = axes[int(math.ceil((mol_num+1)/float(max_cols)))-1,
                      mol_num % max_cols]
            ax.imshow(img)#, aspect='equal')
            ax.set_frame_on(frame_on)

            if label_size:
                ax.text(0,0.8,letter_prefix+self._get_letter(mol_num+letter_offset), 
                        size=label_size, weight="bold")
            
            info = ', '.join(df[info_columns].loc[indx].fillna('-').astype(str))
            if info_incl_id:
                info = str(indx) + ', ' + info
                                
            caption.append(
                '(' + letter_prefix+self._get_letter(mol_num+letter_offset) + ') ' + info)
            
            mol_num += 1            

        #resize extra axes to be same as last img
        while mol_num < num_rows*num_cols:
            ax = axes[int(math.ceil((mol_num+1)/float(max_cols)))-1,
                      mol_num % max_cols]
            ax.imshow(img)
            ax.clear()
            mol_num += 1               

        fig.tight_layout(w_pad=padding[0], h_pad=padding[1])
                
        caption = ', '.join(caption)
        #insert newline character every 80 charaters
        #caption = re.sub("(.{80})", "\\1\n", caption, 0, re.DOTALL)
        
        return fig, caption
        
                
[docs]    def plot_mol_graphs(self, gtype='energy', share_plot=False, max_cols=1, 
                    padding=(1,1), tick_rotation=0,
                    rows=[], filters={}, sort_columns=[], info_columns=[], 
                    info_incl_id=False, letter_prefix='', start_letter='A',
                    grid=True, sharex=True, sharey=True, legend_size=10,
                    color_scheme='jet', eunits='eV',
                    per_energy=1., lbound=None, ubound=None,
                    color_homo='g', color_lumo='r', 
                    homo_lumo_lines=True,homo_lumo_values=True,band_gap_value=True):
        """get a set of data plots for each molecule
        
        Parameters
        ----------
        gtype : str
            the type of plot, 
            energy = optimisation energies, 
            freq = frequency analsis,
            dos = Densty of States,
        share_plot : bool
            whether to plot all data on the same or separate axes
        max_cols : int
            maximum columns on plots (share_plot=False only)
        padding: tuple
            padding between images (horizontally, vertically)
        tick_rotation : int
            rotation of x-axis labels
        rows : int or list
            index for the row of each molecule to plot (all plotted if empty)
        filters : dict
            {columns:values} to filter by
        sort_columns : list of str
            columns to sort by
        info_columns : list of str
            columns to use as info in caption
        info_incl_id : bool
            include molecule id number in labels
        letter_prefix : str
            prefix for labelling subplots (share_plot=False only)
        start_letter : str
            starting (capital) letter for labelling subplots (share_plot=False only)
        grid : bool
            whether to include a grid in the axes
        sharex : bool
            whether to align x-axes (share_plot=False only)
        sharey : bool
            whether to align y-axes (share_plot=False only)
        legend_size : int
            the font size (in pts) for the legend
        color_scheme : str
            the scheme to use for each molecule (share_plot=True only)
            according to http://matplotlib.org/examples/color/colormaps_reference.html 
        eunits : str
            the units of energy to use
        per_energy : float
            energy interval to group states by (DoS only)
        lbound : float
            lower bound energy (DoS only)
        ubound: float
            upper bound energy (DoS only)
        color_homo : matplotlib.colors
            color of homo in matplotlib format
        color_lumo : matplotlib.colors
            color of lumo in matplotlib.colors
        homo_lumo_lines : bool
            draw lines at HOMO and LUMO energies
        homo_lumo_values : bool
            annotate HOMO and LUMO lines with exact energy values
        band_gap_value : bool
            annotate inbetween HOMO and LUMO lines with band gap value
        
        Returns
        -------
        data : matplotlib.figure.Figure
            plotted frequency data
        caption : str
            A caption describing each subplot, given info_columns
            
        """
        df = self.get_table(columns=list(set(info_columns+sort_columns)), 
                            rows=rows, 
                            filters=filters, mol=True)
        num_plots = df.index.shape[0]
        
        if sort_columns:
            df.sort(sort_columns, inplace=True)        

        if gtype == 'energy':
            mol_func = 'plot_opt_energy'
            x_label = 'Optimisation Step'
            y_label = 'Energy ({0})'.format(eunits)
            all_plot_kwargs = {'units':eunits}
            per_plot_kwargs = {'linecolor':getattr(cm,color_scheme)(
                                            np.linspace(0.1, 0.9, num_plots))}
        elif gtype == 'freq':
            mol_func = 'plot_freq_analysis'
            x_label = 'Frequency ($cm^{-1}$)'
            y_label = 'IR Intensity ($km/mol$)' 
            all_plot_kwargs = {}
            per_plot_kwargs = {'color':getattr(cm,color_scheme)(
                                                np.linspace(0, 1, num_plots)),
                               'alpha':np.linspace(1, 0.5, num_plots),
                                'marker_size':np.linspace(25, 15, num_plots)}
        elif gtype == 'dos':
            if share_plot:
                raise ValueError('share_plots not available for Density of States')
            mol_func = 'plot_dos'
            x_label = 'Density of States (per {0} {1})'.format(per_energy, eunits)
            y_label = 'Energy ({})'.format(eunits)
            all_plot_kwargs = {'eunits':eunits, 'per_energy':per_energy, 
                               'lbound':lbound, 'ubound':ubound,
                 'color_homo':color_homo, 'color_lumo':color_lumo, 
                 'homo_lumo_lines':homo_lumo_lines, 'homo_lumo_values':homo_lumo_values,
                 'band_gap_value':band_gap_value, 'legend_size':legend_size}
        else:
            raise ValueError('gtype; {0}, not available'.format(gtype))

        ax_num = 0
        caption = []  

        if share_plot:
            fig, ax = plt.subplots()
            
            legend = []
            for indx, row in df.iterrows():
                plot_kwargs = all_plot_kwargs.copy()
                for k, v in per_plot_kwargs.iteritems():
                    plot_kwargs[k] = v[ax_num]
                getattr(row.Molecule, mol_func)(ax=ax, **plot_kwargs)

                label = ', '.join(row[info_columns].fillna('-').astype(str))
                if info_incl_id:
                    label = str(indx) + ', ' + label
                legend.append(label)
                
                ax_num += 1

            ax.grid(grid)
            for tick in ax.get_xticklabels():
                tick.set_rotation(tick_rotation)
            ax.legend(legend, bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.,
                      prop={'size':legend_size})
        else:
        
            num_rows = int(math.ceil(num_plots/float(max_cols)))
            num_cols = min([max_cols, num_plots])
            fig, axes = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False,
                                     sharex=sharex, sharey=sharey)

            letter_offset = string.ascii_uppercase.find(start_letter)
            if letter_offset == -1:
                raise ValueError('start_letter must be an uppercase single letter')

            for indx, row in df.iterrows():
                i = int(math.ceil((ax_num+1)/float(max_cols)))-1
                j = ax_num % max_cols               
                
                getattr(row.Molecule, mol_func)(ax=axes[i,j], **all_plot_kwargs) 
                axes[i,j].grid(grid)
                for tick in axes[i,j].get_xticklabels():
                    tick.set_rotation(tick_rotation)

                info = ', '.join(row[info_columns].fillna('-').astype(str))
                if info_incl_id:
                    info = str(indx) + ', ' + info
                letter = self._get_letter(ax_num+letter_offset)
                axes[i,j].set_title(letter_prefix+letter, fontweight="bold")
                    
                caption.append('(' + letter_prefix+letter + ') ' + info)
                
                ax_num += 1
            
            #hide extraneous axes
            for extra_ax in range(ax_num, num_rows*num_cols):
                i = int(math.ceil((extra_ax+1)/float(max_cols)))-1
                j = extra_ax % max_cols  
                axes[i,j].axis('off')


            ax = fig.add_subplot(111)    # The big subplot
            ax.tick_params(top='off', bottom='off', left='off', right='off',
                           labelbottom='on', labelleft='on', pad=25)
            
            ax.set_xticklabels([])
            ax.set_yticklabels([])           
            ax.set_frame_on(False)
        
        ax.set_xlabel(x_label)  
        ax.set_ylabel(y_label)
        
        fig.tight_layout(w_pad=padding[0], h_pad=padding[1])
        
        caption = ', '.join(caption)
        
        return fig, caption
        
        
[docs]    def plot_radviz_comparison(self, category_column, 
                               columns=[], rows=[], filters={}, point_size=30,
                                **kwargs):
        """return plot axis of radviz graph
        
        RadViz is a way of visualizing multi-variate data. 
        It is based on a simple spring tension minimization algorithm. 
        Basically you set up a bunch of points in a plane. In our case they are 
        equally spaced on a unit circle. Each point represents a single attribute. 
        You then pretend that each sample in the data set is attached to each 
        of these points by a spring, the stiffness of which is proportional to 
        the numerical value of that attribute (they are normalized to unit 
        interval). The point in the plane, where our sample settles to (where 
        the forces acting on our sample are at an equilibrium) is where a dot 
        representing our sample will be drawn. Depending on which class that 
        sample belongs it will be colored differently.
        """
        col_names = self._df.drop('Molecule', axis=1).columns.tolist()
        if category_column not in col_names:
            raise ValueError('{0} not in columns'.format(category_column))
        
        columns = columns[:]
        if columns and category_column not in columns:
            if all(isinstance(item, int) for item in columns):
                columns.append(col_names.index(category_column))
            else:
                columns.append(category_column) 
            
        df = self.get_table(rows, columns, filters)
        df = df.sort(category_column)

        f, ax = plt.subplots()
        ax = radviz(df, category_column, ax=ax, s=point_size, **kwargs)
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        
        return ax
    
[docs]    def calc_kmean_groups(self, category_column, category_name, groups,
                    columns=[], rows=[], filters={}):
        """calculate the kmeans grouping of rows 
        
        The KMeans algorithm clusters data by trying to separate samples in n 
        groups of equal variance, minimizing a criterion known as the inertia 
        or within-cluster sum-of-squares. This algorithm requires the number of 
        clusters to be specified. It scales well to large number of samples and 
        has been used across a large range of application areas in many 
        different fields.
        """
        col_names = self._df.drop('Molecule', axis=1).columns.tolist()
        if category_column not in col_names:
            raise ValueError('{0} not in columns'.format(category_column))

        filters[category_column] = category_name
        df = self.get_table(rows, columns, filters)
        
        k_means = KMeans(n_clusters=groups)
        k_means.fit(df)
        cats = k_means.predict(df)
        
        return pd.DataFrame({'Name':category_name, 'Category':cats}, 
                            index=df.index)
                
if __name__ == '__main__':
    pass