# Source code for pygauss.docs

# -*- coding: utf-8 -*-
"""
Created on Tue Jun 16 15:52:53 2015

@author: chris sewell
"""
from math import log10, floor
from io import BytesIO
import re

from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_ORIENT
from docx.shared import Cm

from pandas import DataFrame
import numpy as np

class MSDocument(object):
    """a class to output a Microsoft Word Document

    The class wraps a :py:class:`docx.document.Document` instance; any
    attribute that is not defined here is looked up on the wrapped document.

    NB: docx.api.Document can't be directly inherited as it is a function which
    returns various classes dependent on the *docx* parameter
    """

    # font level markup; {font_attribute: (start_chars, end_chars)}
    _MARKUPS = {
            'italic': ('*', '*'),
            'bold': ('**', '**'),
            'subscript': ('_{', '}'),
            'superscript': ('^{', '}'),
            'strike': ('~~', '~~'),
            'math': ('$', '$')
            }

    # paragraph level markup, compiled once rather than on every call
    _LIST_PATTERN = re.compile(r'^[-+]\s')
    _NUMBER_PATTERN = re.compile(r'^\d+[.]\s')
    _HEAD_PATTERN = re.compile(r'^#+\s')
    # a blank (or whitespace only) line separates paragraphs of a docstring
    _PARA_PATTERN = re.compile(r'\n[\s]*\n')
    # bullet, numbered and heading markers (without surrounding whitespace)
    _SPECIAL_PATTERNS = (r'[-+]', r'\d+[.]', r'#+')

    def __init__(self, docx=None):
        """a class to output a Microsoft Word Document

        inherited api details for :py:class:`docx.document.Document` can be
        found at; https://python-docx.readthedocs.org/en/latest/api/document.html

        the class has an internal state for the number of calls to add_picture
        and add_table for use in caption numbering

        Parameters
        ----------
        docx : str or file-like object
            can be either a path to a .docx file (a string) or a file-like object.
            If docx is missing or None, the built-in default document "template"
            is loaded.

        """
        self._docx = Document(docx=docx)

        self._piccount = 0
        self._tablecount = 0

    def __getattr__(self, name):
        """ required to get :py:class:`docx.document.Document` methods

        only called when normal attribute lookup fails
        """
        # if the wrapped document itself is missing (e.g. the instance was
        # created without running __init__) delegating would call this
        # method again for '_docx' and recurse without end
        if name == '_docx':
            raise AttributeError(name)
        return getattr(self._docx, name)

    def __dir__(self):
        """ required to have :py:class:`docx.document.Document` methods in
        :py:mod:`IPython` tab completion

        Returns
        -------
        dirlist : list of str
            sorted names available on this object and on the wrapped document
        """
        # sets are used because dict views cannot be concatenated with '+'
        # on Python 3 (and to drop names defined by both objects)
        names = set(dir(type(self)))
        names.update(self.__dict__)
        names.update(dir(self._docx))
        return sorted(names)

    def add_picture(self, image_path_or_stream, width=None, height=None):
        """ Return a new picture shape added in its own paragraph at the end of
        the document. The picture contains the image at
        *image_path_or_stream*, scaled based on *width* and *height*. If
        neither width nor height is specified, the picture appears at its
        native size. If only one is specified, it is used to compute
        a scaling factor that is then applied to the unspecified dimension,
        preserving the aspect ratio of the image. The native size of the
        picture is calculated using the dots-per-inch (dpi) value specified
        in the image file, defaulting to 72 dpi if no value is specified, as
        is often the case.
        """
        pic = self._docx.add_picture(image_path_or_stream, width, height)
        # only count pictures which were actually added, so that figure
        # caption numbering is not skewed by a failed call
        self._piccount += 1
        return pic

    def add_table(self, rows, cols, style=None):
        """Add a table having row and column counts of *rows* and *cols*
        respectively and table style of *style*. *style* may be a paragraph
        style object or a paragraph style name. If *style* is |None|, the
        table inherits the default table style of the document.
        """
        table = self._docx.add_table(rows, cols, style)
        # only count tables which were actually added, so that table
        # caption numbering is not skewed by a failed call
        self._tablecount += 1
        return table

    def _get_markup(self, para, markup_dict=None):
        """split text into sections of uniform font level markup

        Parameters
        ----------
        para : str
            the text to parse
        markup_dict : dict
            {font_attribute:(start_chars, end_chars)},
            if missing or empty the built in markup is used

        Returns
        -------
        sects : list
            [text, [font_attribute, ...]] pairs, in the order the text occurs,
            with the markup characters removed

        Raises
        ------
        ValueError
            if a markup is entered but never exited

        """
        if not markup_dict:
            markup_dict = self._MARKUPS

        # one row per markup, recording whether we are currently inside it
        df = DataFrame(markup_dict, index=['Enter', 'Exit']).T
        df['In'] = False

        sects = []
        place = 0
        while place > -1:
            place = -1
            markup = None
            estr = None
            # a markup which is not active can only be entered and
            # an active markup can only be exited
            candidates = (list(df.loc[~df.In, 'Enter'].items())
                          + list(df.loc[df.In, 'Exit'].items()))
            for mark, token in candidates:
                find = para.find(token)
                if find == -1:
                    continue
                if place != -1 and find > place:
                    continue
                # at the same position the longer token wins ('**' over '*')
                if find == place and len(token) < len(estr):
                    continue
                place, markup, estr = find, mark, token

            if place > -1:
                sects.append([para[:place], df[df.In].index.tolist()])
                df.at[markup, 'In'] = not df.at[markup, 'In']
                para = para[place+len(estr):]

        if df.In.any():
            raise ValueError(
                'the markup does not exit from;\n{}'.format(df[df.In==True]))

        sects.append([para, []])

        return sects

    def add_markdown(self, text='', style='Body Text',
                     markup_dict=None, para=None):
        r"""adds a paragraph to the document, allowing for
        paragraph/font styling akin to a stripped down version of
        markdown text:

        paragraph level::

            # Header (level denoted by number of #'s)
            - bullet list
            1. numbered list

        font level::

            **bold**
            *italic*
            _{subscript}
            ^{superscript}
            ~~strikethrough~~
            $mathML$

        Parameters
        ----------
        text : str
            the text to add
        style : str
            the style to apply (overriden if paragraph level markdown)
        markup_dict : dict
            if set will override built in font level markup
            {font_attribute:(start_chars, end_chars)}
        para : docx.text.paragraph.Paragraph
            a pre-existing paragraph to add the text to
            if set, the paragraph level marker is still removed from the text
            but no new heading/list paragraph is created

        Returns
        -------
        para : docx.text.paragraph.Paragraph
            a paragraph added to the document

        Raises
        ------
        ValueError
            if a font level markup is entered but never exited

        """
        level = 0

        # the patterns are anchored at the start of the text, so the end of
        # the match is the length of the marker (including one whitespace)
        list_match = self._LIST_PATTERN.match(text)
        number_match = self._NUMBER_PATTERN.match(text)
        head_match = self._HEAD_PATTERN.match(text)
        if list_match:
            style = 'List Bullet'
            text = text[list_match.end():]
        elif number_match:
            style = 'List Number'
            text = text[number_match.end():]
        elif head_match:
            # heading level is the number of #'s
            level = head_match.end() - 1
            text = text[head_match.end():]

        if para is None:
            if level:
                para = self.add_heading(level=level)
            else:
                para = self.add_paragraph(style=style)
        if not text:
            return para

        for txt, markups in self._get_markup(text, markup_dict):
            font = para.add_run(txt).font
            for markup in markups:
                setattr(font, markup, True)

        return para

    def _split_special_paras(self, text):
        """split text into paras if a header or list,
        denominated by; # heading, - bullet or 1. numbered

        Parameters
        ----------
        text : str
            the text to split

        Returns
        -------
        texts : list of str
            one string per item (each still starting with its marker) if the
            text starts with a marker, otherwise the unchanged text in a list

        """
        for pattern in self._SPECIAL_PATTERNS:
            if not re.match(r'^{}\s'.format(pattern), text):
                continue
            # a newline is prepended so that the first item is found by the
            # same 'marker at the start of a line' pattern as the others
            item_start = re.compile(r'\n\s*{}\s'.format(pattern))
            starts = item_start.findall('\n'+text)
            texts = item_start.split('\n'+text)
            return [s[1:]+t for s, t in zip(starts, texts[1:])]

        return [text]

    def add_docstring(self, docstring, style='Body Text',
                      markdown=True):
        """adds a doctring to the document

        this function will split text into paragraphs
        (denominated by a separating blank line)
        remove new-line characters and add to document, allowing for
        markdown style text designated in
        :py:func:`pygauss.docs.MSDocument.add_markdown`

        Parameters
        ----------
        docstring : str
            the text to add
        style : str
            the style to apply for each paragraph
        markdown : bool
            whether to apply markdown to the text

        Returns
        -------
        paras : list of docx.text.paragraph.Paragraph
            a list of paragraphs added to the document

        """
        docx_paras = []
        paras = self._PARA_PATTERN.split(docstring)

        # remove initial linespace if present
        if paras[0][:1] == '\n':
            paras[0] = paras[0][1:]

        for para in paras:
            if markdown:
                # headings and list items each get a paragraph of their own
                for p in self._split_special_paras(para.strip()):
                    p = p.replace('\n', ' ').strip()
                    docx_paras.append(self.add_markdown(p, style=style))
            else:
                para = para.replace('\n', ' ').strip()
                docx_paras.append(self.add_paragraph(para, style=style))

        return docx_paras

    def add_list(self, text_list=None, numbered=False):
        """adds a list

        Parameters
        ----------
        text_list : list of str
            the text of each list item, nothing is added if missing or empty
        numbered : bool
            whether to use a numbered list rather than a bullet list

        Returns
        -------
        paras : list of docx.text.paragraph.Paragraph
            the paragraphs added to the document, one per item

        """
        style = 'List Number' if numbered else 'List Bullet'
        return [self.add_paragraph(tx, style=style)
                for tx in (text_list or [])]

    def add_mpl(self, fig, dpi=None, width=None, height=None, pad_inches=0.2,
                caption=None):
        """add matplotlib figure to the document

        Parameters
        ----------
        fig : matplotlib.figure.Figure
            a matplotlib figure
        dpi : int
            Dots per inch
        width : float
            width of image in document (in cm)
        height : float
            height of image in document (in cm)
        pad_inches : float
            amount of padding around the figure
        caption : str
            a caption below the figure

        Returns
        -------
        pic : docx.shape.InlineShape
            an inline picture added to the document

        """
        stream = BytesIO()
        fig.savefig(stream, format='png', dpi=dpi,
                    bbox_inches='tight', pad_inches=pad_inches,
                    transparent=True)
        # savefig leaves the position at the end of the written data
        stream.seek(0)

        width = Cm(width) if width else None
        height = Cm(height) if height else None

        pic = self.add_picture(stream, width=width, height=height)
        self.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER

        if caption is not None:
            self.add_markdown('***Figure '+ str(self._piccount) + ':*** ' + str(caption),
                              style='Caption')

        return pic

    def _add_headrw(self, cell, val):
        """ add value to header table cell

        the value is written in bold to the first paragraph of the cell,
        which is centred and kept with the next paragraph

        Returns
        -------
        p : docx.text.paragraph.Paragraph
            the first paragraph of the cell
        """
        val = '' if val is None else str(val)
        p = cell.paragraphs[0]
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        p.paragraph_format.keep_with_next = True
        self.add_markdown('**'+val+'**', para=p)
        return p

    def _sigfigs(self, val, sig_figures=5):
        """round to significant figure

        Parameters
        ----------
        val : object
            the value to round
        sig_figures : int
            number of significant figures

        Returns
        -------
        val : object
            the rounded number, or the unchanged value if it is a boolean,
            zero or can not be rounded (e.g. text, nan or infinity)

        """
        # booleans support arithmetic but should not be shown as numbers
        if isinstance(val, (bool, np.bool_)):
            return val
        # deliberately best-effort: table cells may hold any kind of object
        # and whatever can not be rounded is returned as it is
        try:
            if val == 0:
                # log10 is undefined for zero
                return val
            if val > 0:
                return round(val, -int(floor(log10(val))) + (sig_figures-1))
            return -round(-val, -int(floor(log10(-val))) + (sig_figures-1))
        except Exception:
            return val

    def add_dataframe(self, df, incl_indx=True, autofit=True, sig_figures=5,
                      style='Medium List 1 Accent 1', caption=None):
        """add dataframe as a table to the document

        missing values are shown as '-', multi-level columns are written as
        multiple header rows and multi-level indexes as multiple columns

        Parameters
        ----------
        df : pandas.DataFrame
            a pandas dataframe
        incl_indx : bool
            include dataframes index in table
        autofit : bool
            allow table to autofit content
        sig_figures : int
            number of significant figures for numbers in table
        style : str
            MS Word table style
        caption : str
            add a caption below the table

        Returns
        -------
        table : docx.table.Table
            a table added to the document

        """
        df = df.fillna('-')
        rows, cols = df.shape

        # only a multi-level index has the 'levels' attribute
        multi_columns = hasattr(df.columns, 'levels')
        multi_index = hasattr(df.index, 'levels')

        hrows = len(df.columns.levels) if multi_columns else 1
        if not incl_indx:
            icols = 0
        elif multi_index:
            icols = len(df.index.levels)
        else:
            icols = 1

        table = self.add_table(rows=rows+hrows, cols=cols+icols,
                               style=style)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER
        table.autofit = autofit

        #add header rows
        if multi_columns:

            # one row per column, one column per header level
            h_array = np.array(df.columns.tolist())
            rw_count = h_array.shape[1]

            for rw in range(rw_count):
                #merge adjacent headers with the same value for higher level indexes
                rw_values = h_array[:,rw].tolist()
                start_col = end_col = icols
                while rw_values:
                    val = rw_values.pop(0)
                    if rw < (rw_count-1):
                        while rw_values and val == rw_values[0]:
                            rw_values.pop(0)
                            end_col += 1
                    # the cells are looked up again for every header as
                    # merging changes the cells of the row
                    cell = table.rows[rw].cells[start_col]
                    if start_col != end_col:
                        cell = cell.merge(table.rows[rw].cells[end_col])
                    self._add_headrw(cell, val)
                    start_col = end_col = end_col + 1

        else:
            for col, val in enumerate(df.keys()):
                cell = table.rows[hrows-1].cells[col+icols]
                self._add_headrw(cell, val)

        if incl_indx:
            # a single level index has exactly one name (possibly None)
            for col, name in enumerate(df.index.names):
                # the name goes in the last header row, directly above the data
                self._add_headrw(table.rows[hrows-1].cells[col], name)
                #ensure all cells are formatted correctly
                for row in range(hrows-2, -1, -1):
                    self._add_headrw(table.rows[row].cells[col], '')

        #add data rows
        index_values = df.index.tolist() if (incl_indx and multi_index) else None
        for row, (_, series) in enumerate(df.iterrows()):
            # no cells are merged in data rows, so look them up once per row
            row_cells = table.rows[row+hrows].cells

            if incl_indx:
                if multi_index:
                    labels = index_values[row]
                else:
                    labels = (df.index[row],)
                for col, val in enumerate(labels):
                    self._add_headrw(row_cells[col], val)

            for col, (_, value) in enumerate(series.items()):
                cell = row_cells[col+icols]
                cell.text = str(self._sigfigs(value, sig_figures))
                p = cell.paragraphs[0]
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                p.paragraph_format.keep_with_next = True

        if caption is not None:
            self.add_markdown('***Table '+ str(self._tablecount) + ':*** ' + str(caption),
                              style='Caption')

        return table