# Source code for the MSDocument class (Microsoft Word output via python-docx)

# -*- coding: utf-8 -*-
"""
Created on Tue Jun 16 15:52:53 2015

@author: chris sewell
"""
from math import log10, floor
from io import BytesIO
import re

from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.section import WD_ORIENT
from docx.shared import Cm

from pandas import DataFrame
import numpy as np

class MSDocument(object):
    """a class to output a Microsoft Word Document

    NB: docx.api.Document can't be directly inherited as it is a function
    which returns various classes dependent on the *docx* parameter.
    The wrapped document is therefore held in ``self._docx`` and any
    attribute not defined on this class is forwarded to it.
    """

    # font level markup; {font_attribute: (start_chars, end_chars)}
    _MARKUPS = {
        'italic': ('*', '*'),
        'bold': ('**', '**'),
        'subscript': ('_{', '}'),
        'superscript': ('^{', '}'),
        'strike': ('~~', '~~'),
        'math': ('$', '$'),
    }

    # paragraph level markup, matched at the very start of the text
    _BULLET_RE = re.compile(r'^[-+]\s')
    _NUMBER_RE = re.compile(r'^\d+[.]\s')
    _HEADING_RE = re.compile(r'^#+\s')

    def __init__(self, docx=None):
        """a class to output a Microsoft Word Document

        inherited api details for :py:class:`docx.document.Document` can be
        found in the python-docx documentation

        the class has an internal state for the number of calls to
        add_picture and add_table for use in caption numbering

        Parameters
        ----------
        docx : str or file-like object
            can be either a path to a .docx file (a string) or a file-like
            object. If docx is missing or None, the built-in default
            document "template" is loaded.

        """
        self._docx = Document(docx=docx)
        self._piccount = 0
        self._tablecount = 0

    def __getattr__(self, name):
        """ required to get :py:class:`docx.document.Document` methods

        only called when normal attribute lookup on this object fails
        """
        # if _docx itself is missing (bare instance, copy, unpickle),
        # self._docx would re-enter __getattr__ and recurse without bound
        if name == '_docx':
            raise AttributeError(name)
        return getattr(self._docx, name)

    def __dir__(self):
        """ required to have :py:class:`docx.document.Document` methods in
        :py:mod:`IPython` tab completion"""
        # dict views cannot be concatenated with + on Python 3,
        # so merge the two name sets instead
        names = set(self.__class__.__dict__)
        names.update(self._docx.__class__.__dict__)
        return sorted(names)

    def add_picture(self, image_path_or_stream, width=None, height=None):
        """
        Return a new picture shape added in its own paragraph at the end of
        the document. The picture contains the image at
        *image_path_or_stream*, scaled based on *width* and *height*. If
        neither width nor height is specified, the picture appears at its
        native size. If only one is specified, it is used to compute
        a scaling factor that is then applied to the unspecified dimension,
        preserving the aspect ratio of the image. The native size of the
        picture is calculated using the dots-per-inch (dpi) value specified
        in the image file, defaulting to 72 dpi if no value is specified, as
        is often the case.

        Also increments the picture counter used for figure captions.
        """
        self._piccount += 1
        return self._docx.add_picture(image_path_or_stream, width, height)

    def add_table(self, rows, cols, style=None):
        """Add a table having row and column counts of *rows* and *cols*
        respectively and table style of *style*. *style* may be a style
        object or a style name. If *style* is |None|, the table inherits
        the default table style of the document.

        Also increments the table counter used for table captions.
        """
        self._tablecount += 1
        return self._docx.add_table(rows, cols, style)

    def _get_markup(self, para, markup_dict=None):
        """split text into sections by font level markup

        Parameters
        ----------
        para : str
            the text to parse
        markup_dict : dict
            {font_attribute:(start_chars, end_chars)},
            defaults to the built in markup if missing or empty

        Returns
        -------
        sects : list
            [text, [font_attribute, ...]] pairs, in document order, with
            the markup characters removed

        Raises
        ------
        ValueError
            if a markup is entered but never exited

        """
        if not markup_dict:
            markup_dict = self._MARKUPS

        # one row per markup; 'In' records whether it is currently open
        df = DataFrame(markup_dict, index=['Enter', 'Exit']).T
        df['In'] = False

        sects = []
        place = 0
        while place > -1:
            place = -1
            markup = None
            estr = None
            # look for the earliest token; closed markups can only be
            # entered and open markups can only be exited. Enter tokens
            # are scanned first, so an exit token wins a same-length tie.
            for column, is_open in (('Enter', False), ('Exit', True)):
                tokens = df.loc[df['In'] == is_open, column]
                for mark, token in tokens.items():
                    find = para.find(token)
                    if find == -1:
                        continue
                    if place != -1 and find > place:
                        continue
                    # at the same position prefer the longer token,
                    # e.g. '**' (bold) over '*' (italic)
                    if find == place and len(token) < len(estr):
                        continue
                    place = find
                    markup = mark
                    estr = token

            if place > -1:
                open_marks = df.index[df['In'].astype(bool)].tolist()
                sects.append([para[:place], open_marks])
                df.loc[markup, 'In'] = not df.loc[markup, 'In']
                para = para[place + len(estr):]

        if df['In'].any():
            raise ValueError(
                'the markup does not exit from;\n{}'.format(
                    df[df['In'].astype(bool)]))

        sects.append([para, []])
        return sects

    def add_markdown(self, text='', style='Body Text', markup_dict=None,
                     para=None):
        r"""adds a paragraph to the document, allowing for
        paragraph/font styling akin to a stripped down version of
        markdown text:

        paragraph level::

            # Header (level denoted by number of #'s)
            - bullet list
            1. numbered list

        font level::

            **bold**
            *italic*
            _{subscript}
            ^{superscript}
            ~~strikethrough~~
            $mathML$

        Parameters
        ----------
        text : str
            the text to add
        style : str
            the style to apply (overriden if paragraph level markdown)
        markup_dict : dict
            if set will override built in font level markup
            {font_attribute:(start_chars, end_chars)}
        para : docx.text.paragraph.Paragraph
            a pre-existing paragraph to add the text to
            if set, will ignore paragraph level markdown
            (the leading marker is still removed from the text)

        Returns
        -------
        para : docx.text.paragraph.Paragraph
            a paragraph added to the document

        Raises
        ------
        ValueError
            if a font level markup is entered but never exited

        """
        level = 0
        bullet = self._BULLET_RE.match(text)
        number = self._NUMBER_RE.match(text)
        heading = self._HEADING_RE.match(text)
        if bullet:
            style = 'List Bullet'
            text = text[bullet.end():]
        elif number:
            style = 'List Number'
            text = text[number.end():]
        elif heading:
            # the match is the #'s plus one whitespace character
            level = len(heading.group()) - 1
            text = text[heading.end():]

        if para is None:
            if level:
                para = self.add_heading(level=level)
            else:
                para = self.add_paragraph(style=style)

        if not text:
            return para

        for txt, markups in self._get_markup(text, markup_dict):
            font = para.add_run(txt).font
            for markup in markups:
                setattr(font, markup, True)

        return para

    def _split_special_paras(self, text):
        """split text into paras if a header or list, denominated by;
        # heading, - bullet or 1. numbered

        only the marker type found at the start of the text is split on;
        text with no leading marker is returned as a single item

        Parameters
        ----------
        text : str
            the text to split

        Returns
        -------
        paras : list of str
            the separate paragraphs, each still carrying its marker

        """
        for pattern in (r'[-+]', r'\d+[.]', r'#+'):
            if not re.match(r'^{}\s'.format(pattern), text):
                continue
            splitter = re.compile(r'\n\s*{}\s'.format(pattern))
            # prepend a newline so the first marker is found like the rest
            padded = '\n' + text
            starts = splitter.findall(padded)
            texts = splitter.split(padded)
            # s[1:] drops the newline that preceded each marker
            return [s[1:] + t for s, t in zip(starts, texts[1:])]
        return [text]

    def add_docstring(self, docstring, style='Body Text', markdown=True):
        """adds a doctring to the document

        this function will split text into paragraphs
        (denominated by a separating blank line)
        remove new-line characters and add to document,
        allowing for markdown style text designated in
        :py:func:`add_markdown`

        Parameters
        ----------
        docstring : str
            the text to add
        style : str
            the style to apply for each paragraph
        markdown : bool
            whether to apply markdown to the text

        Returns
        -------
        paras : list of docx.text.paragraph.Paragraph
            a list of paragraphs added to the document

        """
        docx_paras = []
        paras = re.split(r'\n[\s]*\n', docstring)
        # remove initial linespace if present
        if paras[0].startswith('\n'):
            paras[0] = paras[0][1:]

        for para in paras:
            if markdown:
                for p in self._split_special_paras(para.strip()):
                    p = p.replace('\n', ' ').strip()
                    docx_paras.append(self.add_markdown(p, style=style))
            else:
                para = para.replace('\n', ' ').strip()
                docx_paras.append(self.add_paragraph(para, style=style))

        return docx_paras

    def add_list(self, text_list=None, numbered=False):
        """adds a list

        Parameters
        ----------
        text_list : list of str
            one entry per list item (nothing is added if missing or None)
        numbered : bool
            use a numbered list rather than a bulleted list

        Returns
        -------
        paras : list of docx.text.paragraph.Paragraph
            the paragraphs added to the document, one per item

        """
        if text_list is None:
            text_list = []
        style = 'List Number' if numbered else 'List Bullet'
        return [self.add_paragraph(tx, style=style) for tx in text_list]

    def add_mpl(self, fig, dpi=None, width=None, height=None,
                pad_inches=0.2, caption=None):
        """add matplotlib figure to the document

        Parameters
        ----------
        fig : matplotlib.figure.Figure
            a matplotlib figure
        dpi : int
            Dots per inch
        width : float
            width of image in document (in cm)
        height : float
            height of image in document (in cm)
        pad_inches : float
            amount of padding around the figure
        caption : str
            a caption below the figure

        Returns
        -------
        pic : docx.shape.InlineShape
            an inline picture added to the document

        """
        stream = BytesIO()
        fig.savefig(stream, format='png', dpi=dpi,
                    bbox_inches='tight', pad_inches=pad_inches,
                    transparent=True)
        # rewind so the image is read from its first byte
        stream.seek(0)

        width = Cm(width) if width else None
        height = Cm(height) if height else None

        pic = self.add_picture(stream, width=width, height=height)
        self.paragraphs[-1].alignment = WD_ALIGN_PARAGRAPH.CENTER
        if caption is not None:
            self.add_markdown('***Figure ' + str(self._piccount) + ':*** '
                              + str(caption), style='Caption')
        return pic

    def _add_headrw(self, cell, val):
        """ add value to header table cell

        the value is written in bold, centred, into the first paragraph
        of the cell; None is written as an empty string

        Returns
        -------
        p : docx.text.paragraph.Paragraph
            the first paragraph of the cell
        """
        val = '' if val is None else str(val)
        p = cell.paragraphs[0]
        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
        p.paragraph_format.keep_with_next = True
        self.add_markdown('**' + val + '**', para=p)
        return p

    def _sigfigs(self, val, sig_figures=5):
        """round to significant figure

        booleans and anything that cannot be rounded (strings, zero,
        nan, infinity, ...) are returned unchanged
        """
        if isinstance(val, bool):
            return val
        try:
            if val >= 0.:
                digits = -int(floor(log10(val))) + (sig_figures - 1)
                return round(val, digits)
            digits = -int(floor(log10(-val))) + (sig_figures - 1)
            return -round(-val, digits)
        except (TypeError, ValueError, ArithmeticError):
            # best effort only: non-numeric values are left as they are
            return val

    def add_dataframe(self, df, incl_indx=True, autofit=True, sig_figures=5,
                      style='Medium List 1 Accent 1', caption=None):
        """add dataframe as a table to the document

        Parameters
        ----------
        df : pandas.DataFrame
            a pandas dataframe
        incl_indx : bool
            include dataframes index in table
        autofit : bool
            allow table to autofit content
        sig_figures : int
            number of significant figures for numbers in table
        style : str
            MS Word table style
        caption : str
            add a caption below the table

        Returns
        -------
        table : docx.table.Table
            a table added to the document

        """
        df = df.fillna('-')
        rows, cols = df.shape

        # an index with a 'levels' attribute is treated as multi-level
        multi_cols = hasattr(df.columns, 'levels')
        multi_indx = hasattr(df.index, 'levels')

        hrows = len(df.columns.levels) if multi_cols else 1
        if not incl_indx:
            icols = 0
        elif multi_indx:
            icols = len(df.index.levels)
        else:
            icols = 1

        table = self.add_table(rows=rows + hrows, cols=cols + icols,
                               style=style)
        table.alignment = WD_TABLE_ALIGNMENT.CENTER
        table.autofit = autofit

        # add header rows
        if multi_cols:
            h_array = np.array(df.columns.tolist())
            col_count, rw_count = h_array.shape
            for rw in range(rw_count):
                values = h_array[:, rw].tolist()
                # merge adjacent headers with the same value for
                # higher level indexes (never for the lowest level)
                merge = rw < (rw_count - 1)
                start = 0
                while start < col_count:
                    end = start
                    if merge:
                        while (end + 1 < col_count
                               and values[end + 1] == values[start]):
                            end += 1
                    cell = table.rows[rw].cells[start + icols]
                    if end != start:
                        cell = cell.merge(table.rows[rw].cells[end + icols])
                    self._add_headrw(cell, values[start])
                    start = end + 1
        else:
            head_cells = table.rows[hrows - 1].cells
            for col, val in enumerate(df.keys()):
                self._add_headrw(head_cells[col + icols], val)

        # add index header(s)
        if incl_indx:
            if multi_indx:
                names = df.index.names
            else:
                # NOTE(review): this argument was truncated in the
                # available source; df.index.name is assumed by analogy
                # with the multi-index branch - confirm against upstream
                names = [df.index.name]
            for col, name in enumerate(names):
                self._add_headrw(table.rows[hrows - 1].cells[col], name)
                # ensure all cells are formatted correctly
                for row in range(hrows - 2, -1, -1):
                    self._add_headrw(table.rows[row].cells[col], '')

        # add data rows
        index_values = df.index.tolist() if incl_indx else []
        for row, (_, series) in enumerate(df.iterrows()):
            row_cells = table.rows[row + hrows].cells
            if incl_indx:
                if multi_indx:
                    for col, val in enumerate(index_values[row]):
                        self._add_headrw(row_cells[col], val)
                else:
                    self._add_headrw(row_cells[0], df.index[row])
            for col, (_, item) in enumerate(series.items()):
                cell = row_cells[col + icols]
                cell.text = str(self._sigfigs(item, sig_figures))
                p = cell.paragraphs[0]
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
                p.paragraph_format.keep_with_next = True

        if caption is not None:
            self.add_markdown('***Table ' + str(self._tablecount) + ':*** '
                              + str(caption), style='Caption')
        return table