# -*- coding: utf-8 -*-
'''
Created : 2015-03-12

@author: Eric Lapouyade
'''
import functools
import io

__version__ = '0.10.3'

from lxml import etree
from docx import Document
from docx.opc.oxml import parse_xml
from docx.opc.part import XmlPart
import docx.oxml.ns
from docx.opc.constants import RELATIONSHIP_TYPE as REL_TYPE
from jinja2 import Environment, Template, meta
from jinja2.exceptions import TemplateError
try:
    from html import escape
except ImportError:
    # cgi.escape is deprecated in python 3.7
    from cgi import escape
import re
import six
import binascii
import os
import zipfile
from functools import partial

# WordprocessingML fragments substituted for control characters found in
# rendered text.  Each one closes the current <w:t> (and, where needed, the
# enclosing run/paragraph), emits the break and re-opens a space-preserving
# <w:t> so that the surrounding markup stays well-formed.
NEWLINE_XML = '</w:t><w:br/><w:t xml:space="preserve">'
NEWPARAGRAPH_XML = '</w:t></w:r></w:p><w:p><w:r><w:t xml:space="preserve">'
TAB_XML = '</w:t></w:r><w:r><w:tab/></w:r><w:r><w:t xml:space="preserve">'
PAGE_BREAK = '</w:t><w:br w:type="page"/><w:t xml:space="preserve">'


class DocxTemplate(object):
    """ Class for managing docx files as they were jinja2 templates """

    HEADER_URI = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/header"
    FOOTER_URI = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/footer"

    def __init__(self, docx):
        """Open ``docx`` with python-docx and reset all replacement tables.

        ``docx`` is passed unchanged to ``docx.Document``.
        """
        self.docx = Document(docx)
        self.crc_to_new_media = {}
        self.crc_to_new_embedded = {}
        self.zipname_to_replace = {}
        self.pic_to_replace = {}
        self.pic_map = {}

    def __getattr__(self, name):
        """Delegate unknown attributes to the wrapped python-docx document."""
        # Only called when normal lookup fails: if 'docx' itself is missing
        # (instance created without __init__, e.g. while being copied or
        # unpickled) delegating would recurse forever.
        if name == 'docx':
            raise AttributeError(name)
        return getattr(self.docx, name)

    def xml_to_string(self, xml, encoding='unicode'):
        """Serialize an lxml element with ``etree.tostring``.

        ``encoding`` is forwarded to lxml; the default ``'unicode'`` yields a
        text string, any real codec name yields bytes.
        """
        # Be careful : pretty_print MUST be set to False, otherwise patch_xml()
        # won't work properly
        return etree.tostring(xml, encoding=encoding, pretty_print=False)

    def get_docx(self):
        """Return the wrapped python-docx document."""
        return self.docx

    def get_xml(self):
        """Return the document body serialized as a text string."""
        return self.xml_to_string(self.docx._element.body)

    def write_xml(self, filename):
        """Write the body xml to ``filename`` (UTF-8 encoded)."""
        # io.open with an explicit encoding behaves the same on python 2 and 3
        # and does not depend on the platform default encoding.
        with io.open(filename, 'w', encoding='utf-8') as fh:
            fh.write(self.get_xml())

    def patch_xml(self, src_xml):
        """ Make a lots of cleanning to have a raw xml understandable by jinja2 :
        strip all unnecessary xml tags, manage table cell background color and colspan,
        unescape html entities, etc... """

        # replace {<something>{ by {{   ( works with {{ }} {% and %} )
        src_xml = re.sub(r'(?<={)(<[^>]*>)+(?=[\{%])|(?<=[%\}])(<[^>]*>)+(?=\})', '',
                         src_xml, flags=re.DOTALL)

        # replace {{<some tags>jinja2 stuff<some other tags>}} by {{jinja2 stuff}}
        # same thing with {% ... %}
        # "jinja2 stuff" could a variable, a 'if' etc... anything jinja2 will understand
        def striptags(m):
            return re.sub('</w:t>.*?(<w:t>|<w:t [^>]*>)', '',
                          m.group(0), flags=re.DOTALL)
        src_xml = re.sub(r'{%(?:(?!%}).)*|{{(?:(?!}}).)*', striptags,
                         src_xml, flags=re.DOTALL)

        # manage table cell colspan
        def colspan(m):
            cell_xml = m.group(1) + m.group(3)
            # drop the run left empty once the tag has been removed
            cell_xml = re.sub(r'<w:r[ >](?:(?!<w:r[ >]).)*<w:t></w:t>.*?</w:r>',
                              '', cell_xml, flags=re.DOTALL)
            # drop any existing gridSpan before adding the templated one
            cell_xml = re.sub(r'<w:gridSpan[^/]*/>', '', cell_xml, count=1)
            return re.sub(r'(<w:tcPr[^>]*>)',
                          r'\1<w:gridSpan w:val="{{%s}}"/>' % m.group(2),
                          cell_xml)
        src_xml = re.sub(r'(<w:tc[ >](?:(?!<w:tc[ >]).)*){%\s*colspan\s+([^%]*)\s*%}(.*?</w:tc>)',
                         colspan, src_xml, flags=re.DOTALL)

        # manage table cell background color
        def cellbg(m):
            cell_xml = m.group(1) + m.group(3)
            cell_xml = re.sub(r'<w:r[ >](?:(?!<w:r[ >]).)*<w:t></w:t>.*?</w:r>',
                              '', cell_xml, flags=re.DOTALL)
            cell_xml = re.sub(r'<w:shd[^/]*/>', '', cell_xml, count=1)
            return re.sub(r'(<w:tcPr[^>]*>)',
                          r'\1<w:shd w:val="clear" w:color="auto" w:fill="{{%s}}"/>'
                          % m.group(2),
                          cell_xml)
        src_xml = re.sub(r'(<w:tc[ >](?:(?!<w:tc[ >]).)*){%\s*cellbg\s+([^%]*)\s*%}(.*?</w:tc>)',
                         cellbg, src_xml, flags=re.DOTALL)

        # ensure space preservation
        src_xml = re.sub(r'<w:t>((?:(?!<w:t>).)*)({{.*?}}|{%.*?%})',
                         r'<w:t xml:space="preserve">\1\2',
                         src_xml, flags=re.DOTALL)
        src_xml = re.sub(r'({{r\s.*?}}|{%r\s.*?%})',
                         r'</w:t></w:r><w:r><w:t xml:space="preserve">\1'
                         r'</w:t></w:r><w:r><w:t xml:space="preserve">',
                         src_xml, flags=re.DOTALL)

        # {%- will merge with previous paragraph text
        src_xml = re.sub(r'</w:t>(?:(?!</w:t>).)*?{%-', '{%',
                         src_xml, flags=re.DOTALL)
        # -%} will merge with next paragraph text
        src_xml = re.sub(r'-%}(?:(?!<w:t[ >]).)*?<w:t[^>]*?>', '%}',
                         src_xml, flags=re.DOTALL)

        for y in ['tr', 'tc', 'p', 'r']:
            # replace into xml code the row/paragraph/run containing
            # {%y xxx %} or {{y xxx}} template tag
            # by {% xxx %} or {{ xx }} without any surronding <w:y> tags :
            # This is mandatory to have jinja2 generating correct xml code
            pat = (r'<w:%(y)s[ >](?:(?!<w:%(y)s[ >]).)*'
                   r'({%%|{{)%(y)s ([^}%%]*(?:%%}|}})).*?</w:%(y)s>'
                   % {'y': y})
            src_xml = re.sub(pat, r'\1 \2', src_xml, flags=re.DOTALL)

        # add vMerge
        # use {% vm %} to make this table cell and its copies be vertically
        # merged within a {% for %}
        def v_merge_tc(m):
            def v_merge(m1):
                return (
                    '<w:vMerge w:val="{% if loop.first %}restart{% else %}continue{% endif %}"/>' +
                    m1.group(1) +  # Everything between ``</w:tcPr>`` and ``<w:t>``.
                    "{% if loop.first %}" +
                    m1.group(2) +  # Everything before ``{% vm %}``.
                    m1.group(3) +  # Everything after ``{% vm %}``.
                    "{% endif %}" +
                    m1.group(4)  # ``</w:t>``.
                )
            return re.sub(
                r'(</w:tcPr[ >].*?<w:t(?:.*?)>)(.*?)(?:{%\s*vm\s*%})(.*?)(</w:t>)',
                v_merge,
                m.group(),  # Everything between ``<w:tc>`` and ``</w:tc>`` with ``{% vm %}`` inside.
                flags=re.DOTALL,
            )
        src_xml = re.sub(r'<w:tc[ >](?:(?!<w:tc[ >]).)*?{%\s*vm\s*%}.*?</w:tc[ >]',
                         v_merge_tc, src_xml, flags=re.DOTALL)

        # Use ``{% hm %}`` to make table cell become horizontally merged within
        # a ``{% for %}``.
        def h_merge_tc(m):
            xml_to_patch = m.group()  # Everything between ``<w:tc>`` and ``</w:tc>`` with ``{% hm %}`` inside.

            def with_gridspan(m1):
                return (
                    m1.group(1) +  # ``w:gridSpan w:val="``.
                    '{{ ' + m1.group(2) + ' * loop.length }}' +  # Content of ``w:val``, multiplied by loop length.
                    m1.group(3)  # Closing quotation mark.
                )

            def without_gridspan(m2):
                return (
                    '<w:gridSpan w:val="{{ loop.length }}"/>' +
                    m2.group(1) +  # Everything between ``</w:tcPr>`` and ``<w:t>``.
                    m2.group(2) +  # Everything before ``{% hm %}``.
                    m2.group(3) +  # Everything after ``{% hm %}``.
                    m2.group(4)  # ``</w:t>``.
                )

            if re.search(r'w:gridSpan', xml_to_patch):
                # Simple case, there's already ``gridSpan``, multiply its value.
                xml = re.sub(
                    r'(w:gridSpan w:val=")(\d+)(")',
                    with_gridspan,
                    xml_to_patch,
                    flags=re.DOTALL,
                )
                xml = re.sub(
                    r'{%\s*hm\s*%}',
                    '',
                    xml,  # Patched xml.
                    flags=re.DOTALL,
                )
            else:
                # There're no ``gridSpan``, add one.
                xml = re.sub(
                    r'(</w:tcPr[ >].*?<w:t(?:.*?)>)(.*?)(?:{%\s*hm\s*%})(.*?)(</w:t>)',
                    without_gridspan,
                    xml_to_patch,
                    flags=re.DOTALL,
                )

            # Discard every other cell generated in loop.
            return "{% if loop.first %}" + xml + "{% endif %}"

        src_xml = re.sub(r'<w:tc[ >](?:(?!<w:tc[ >]).)*?{%\s*hm\s*%}.*?</w:tc[ >]',
                         h_merge_tc, src_xml, flags=re.DOTALL)

        def clean_tags(m):
            # Inside jinja2 tags, undo xml escaping and replace typographic
            # quotes by plain ones so that jinja2 can parse the expression.
            return (m.group(0)
                    .replace(r"&#8216;", "'")
                    .replace('&lt;', '<')
                    .replace('&gt;', '>')
                    .replace(u'“', u'"')
                    .replace(u'”', u'"')
                    .replace(u"‘", u"'")
                    .replace(u"’", u"'"))
        src_xml = re.sub(r'(?<=\{[\{%])(.*?)(?=[\}%]})', clean_tags, src_xml)

        return src_xml

    def render_xml(self, src_xml, context, jinja_env=None):
        """Render patched xml with jinja2 and return the resulting xml.

        Uses ``jinja_env.from_string`` when an environment is given, a plain
        ``Template`` otherwise.  On ``TemplateError`` carrying a line number,
        a ``docx_context`` attribute (list of up to 7 tag-stripped source
        lines around the error) is attached before the exception is re-raised.
        """
        # One paragraph per line, so that jinja2 line numbers are meaningful.
        src_xml = src_xml.replace(r'<w:p>', '\n<w:p>')
        try:
            if jinja_env:
                template = jinja_env.from_string(src_xml)
            else:
                template = Template(src_xml)
            dst_xml = template.render(context)
        except TemplateError as exc:
            if hasattr(exc, 'lineno') and exc.lineno is not None:
                line_number = max(exc.lineno - 4, 0)
                # A list (not a lazy map) so it can be read more than once.
                exc.docx_context = [
                    re.sub(r'<[^>]+>', '', x)
                    for x in src_xml.splitlines()[line_number:(line_number + 7)]
                ]
            raise
        dst_xml = dst_xml.replace('\n<w:p>', '<w:p>')
        # {_{ ... }_} and {_% ... %_} are the escaped forms of jinja2 delimiters
        dst_xml = (dst_xml
                   .replace('{_{', '{{')
                   .replace('}_}', '}}')
                   .replace('{_%', '{%')
                   .replace('%_}', '%}'))
        dst_xml = self.resolve_listing(dst_xml)
        return dst_xml

    def resolve_listing(self, xml):
        """Turn control characters left in rendered xml into docx markup.

        ``\\n`` becomes a line break and ``\\f`` a page break everywhere;
        inside each run, ``\\t`` becomes a tab and ``\\a`` starts a new
        paragraph, both re-using the run/paragraph properties found.
        """
        xml = xml.replace('\n', NEWLINE_XML)
        xml = xml.replace('\f', PAGE_BREAK)

        def resolve_run(paragraph_properties, m):
            run_properties = re.search(r'<w:rPr>.*</w:rPr>', m[0])
            run_properties = run_properties[0] if run_properties else ''
            xml = m[0].replace('\t', '</w:t></w:r>'
                                     '<w:r>%s<w:tab/></w:r>'
                                     '<w:r>%s<w:t xml:space="preserve">'
                               % (run_properties, run_properties))
            return xml.replace('\a', '</w:t></w:r></w:p>'
                                     '<w:p>%s<w:r>%s<w:t xml:space="preserve">'
                               % (paragraph_properties, run_properties))

        def resolve_paragraph(m):
            paragraph_properties = re.search(r'<w:pPr>.*</w:pPr>', m[0])
            paragraph_properties = paragraph_properties[0] if paragraph_properties else ''
            p_resolve_run = partial(resolve_run, paragraph_properties)
            return re.sub(r'<w:r(?:[^>]*)?>.*?</w:r>', p_resolve_run, m[0])

        xml = re.sub(r'<w:p(?:[^>]*)?>.*?</w:p>', resolve_paragraph, xml)

        return xml

    def build_xml(self, context, jinja_env=None):
        """Return the body xml patched and rendered with ``context``."""
        xml = self.get_xml()
        xml = self.patch_xml(xml)
        xml = self.render_xml(xml, context, jinja_env)
        return xml

    def map_tree(self, tree):
        """Replace the document body element by ``tree``."""
        root = self.docx._element
        body = root.body
        root.replace(body, tree)

    def get_headers_footers_xml(self, uri):
        """Yield ``(relKey, xml)`` for every non-empty part related by ``uri``."""
        for relKey, val in self.docx._part._rels.items():
            if (val.reltype == uri) and (val.target_part.blob):
                yield relKey, self.xml_to_string(parse_xml(val.target_part.blob))

    def get_headers_footers_encoding(self, xml):
        """Return the encoding named in the xml declaration, else ``'utf-8'``."""
        m = re.match(r'<\?xml[^\?]+\bencoding="([^"]+)"', xml, re.I)
        if m:
            return m.group(1)
        return 'utf-8'

    def build_headers_footers_xml(self, context, uri, jinja_env=None):
        """Yield ``(relKey, rendered_xml_bytes)`` for each header/footer part."""
        for relKey, xml in self.get_headers_footers_xml(uri):
            encoding = self.get_headers_footers_encoding(xml)
            xml = self.patch_xml(xml)
            xml = self.render_xml(xml, context, jinja_env)
            yield relKey, xml.encode(encoding)

    def map_headers_footers_xml(self, relKey, xml):
        """Swap the part behind ``relKey`` for a new one loaded from ``xml``.

        The relationships of the old part are copied onto the new part.
        """
        part = self.docx._part._rels[relKey].target_part
        new_part = XmlPart.load(part.partname, part.content_type, xml, part.package)
        for rId, rel in part.rels.items():
            new_part.load_rel(rel.reltype, rel._target, rel.rId, rel.is_external)
        self.docx._part._rels[relKey]._target = new_part

    def render(self, context, jinja_env=None, autoescape=False):
        """Render body, headers and footers in place with ``context``.

        When ``autoescape`` is true it is set on ``jinja_env`` (the passed
        environment is modified), or a new autoescaping ``Environment`` is
        created if none was given.
        """
        if autoescape:
            if not jinja_env:
                jinja_env = Environment(autoescape=autoescape)
            else:
                jinja_env.autoescape = autoescape

        # Body
        xml_src = self.build_xml(context, jinja_env)

        # fix tables if needed
        tree = self.fix_tables(xml_src)

        self.map_tree(tree)

        # Headers
        headers = self.build_headers_footers_xml(context, self.HEADER_URI,
                                                 jinja_env)
        for relKey, xml in headers:
            self.map_headers_footers_xml(relKey, xml)

        # Footers
        footers = self.build_headers_footers_xml(context, self.FOOTER_URI,
                                                 jinja_env)
        for relKey, xml in footers:
            self.map_headers_footers_xml(relKey, xml)

    def fix_tables(self, xml):
        """Parse ``xml`` and make each table grid match its rows.

        Using of TC tag in for cycle can cause that count of columns does not
        correspond to real count of columns in row.  ``gridCol`` elements are
        added or removed accordingly, rescaling widths so that the total width
        is kept.  Tables without ``tblGrid`` are left alone, and columns
        without a ``w`` attribute are never rescaled.

        Returns the parsed lxml tree.
        """
        parser = etree.XMLParser(recover=True)
        tree = etree.fromstring(xml, parser=parser)
        # get namespace
        ns = '{' + tree.nsmap['w'] + '}'

        def col_width(col):
            # Width of a gridCol as float, or None when it has no w attribute.
            raw = col.get(ns + 'w')
            return None if raw is None else float(raw)

        def get_cell_len(total, cell):
            # Add the number of grid columns spanned by ``cell`` to ``total``.
            tc_pr = cell.find(ns + 'tcPr')
            grid_span = None if tc_pr is None else tc_pr.find(ns + 'gridSpan')
            span = None if grid_span is None else grid_span.get(ns + 'val')
            if span is not None:
                return total + int(span)
            return total + 1

        # walk trough xml and find table
        for t in tree.iter(ns + 'tbl'):
            tblGrid = t.find(ns + 'tblGrid')
            if tblGrid is None:
                # nothing to adjust without a grid definition
                continue
            columns = tblGrid.findall(ns + 'gridCol')
            to_add = 0
            # walk trough all rows and try to find if there is higher cell count
            for r in t.iter(ns + 'tr'):
                cells = r.findall(ns + 'tc')
                if (len(columns) + to_add) < len(cells):
                    to_add = len(cells) - len(columns)

            # is neccessary to add columns?
            if to_add > 0:
                # at first, calculate width of table according to columns
                # (we want to preserve it)
                width = 0.0
                for c in columns:
                    width += col_width(c) or 0.0
                # try to keep proportion of table
                if width > 0:
                    old_average = width / len(columns)
                    new_average = width / (len(columns) + to_add)
                    # scale the old columns
                    for c in columns:
                        w = col_width(c)
                        if w is not None:
                            c.set(ns + 'w',
                                  str(int(w * new_average / old_average)))
                    # add new columns
                    for i in range(to_add):
                        etree.SubElement(tblGrid, ns + 'gridCol',
                                         {ns + 'w': str(int(new_average))})

            # Refetch columns after columns addition.
            columns = tblGrid.findall(ns + 'gridCol')
            columns_len = len(columns)

            cells_len_max = 0

            # Calculate max of table cells to compare with `gridCol`.
            for r in t.iter(ns + 'tr'):
                cells = r.findall(ns + 'tc')
                cells_len = functools.reduce(get_cell_len, cells, 0)
                cells_len_max = max(cells_len_max, cells_len)

            to_remove = columns_len - cells_len_max

            # If after the loop, there're less columns, than
            # originally was, remove extra `gridCol` declarations.
            if to_remove > 0:
                # Have to keep track of the removed width to scale the
                # table back to its original width.
                removed_width = 0.0

                for c in columns[-to_remove:]:
                    removed_width += col_width(c) or 0.0
                    tblGrid.remove(c)

                columns_left = tblGrid.findall(ns + 'gridCol')

                # Distribute `removed_width` across all columns that has
                # left after extras removal.
                extra_space = 0
                if len(columns_left) > 0:
                    extra_space = removed_width / len(columns_left)
                    extra_space = int(extra_space)

                for c in columns_left:
                    w = col_width(c)
                    if w is not None:
                        c.set(ns + 'w', str(int(w + extra_space)))

        return tree

    def new_subdoc(self, docpath=None):
        """Return a ``Subdoc`` bound to this template."""
        return Subdoc(self, docpath)

    @staticmethod
    def _read_bytes(file_obj):
        """Return the content of a file-like object or of the file at a path."""
        if hasattr(file_obj, 'read'):
            return file_obj.read()
        with open(file_obj, 'rb') as fh:
            return fh.read()

    @staticmethod
    def get_file_crc(file_obj):
        """Return the unsigned CRC32 of a file path or file-like object."""
        if hasattr(file_obj, 'read'):
            buf = file_obj.read()
        else:
            with open(file_obj, 'rb') as fh:
                buf = fh.read()

        crc = (binascii.crc32(buf) & 0xFFFFFFFF)
        return crc

    def replace_media(self, src_file, dst_file):
        """Replace one media by another one into a docx

        This has been done mainly because it is not possible to add images in
        docx header/footer.
        With this function, put a dummy picture in your header/footer,
        then specify it with its replacement in this function using the file path
        or file-like objects.

        Syntax: tpl.replace_media('dummy_media_to_replace.png','media_to_paste.jpg')
            -- or --
                tpl.replace_media(io.BytesIO(image_stream), io.BytesIO(new_image_stream))

        Note: for images, the aspect ratio will be the same as the replaced image
        Note2: it is important to have the source media file as it is required
               to calculate its CRC to find them in the docx
        """

        crc = self.get_file_crc(src_file)
        self.crc_to_new_media[crc] = self._read_bytes(dst_file)

    def replace_pic(self, embedded_file, dst_file):
        """Replace embedded picture with original-name given by embedded_file.
           (give only the file basename, not the full path)
           The new picture is given by dst_file (either a filename or a file-like
           object)

        Notes:
            1) embedded_file and dst_file must have the same extension/format
               in case dst_file is a file-like object, no check is done on
               format compatibility
            2) the aspect ratio will be the same as the replaced image
            3) There is no need to keep the original file (this is not the case
               for replace_embedded and replace_media)
        """

        # NOTE: file extension not checked
        self.pic_to_replace[embedded_file] = self._read_bytes(dst_file)

    def replace_embedded(self, src_file, dst_file):
        """Replace one embedded object by another one into a docx

        This has been done mainly because it is not possible to add images
        in docx header/footer.
        With this function, put a dummy picture in your header/footer,
        then specify it with its replacement in this function

        Syntax: tpl.replace_embedded('dummy_doc.docx','doc_to_paste.docx')

        Both arguments may be file paths or file-like objects.

        Note2 : it is important to have the source file as it is required to
                calculate its CRC to find them in the docx
        """
        crc = self.get_file_crc(src_file)
        self.crc_to_new_embedded[crc] = self._read_bytes(dst_file)

    def replace_zipname(self, zipname, dst_file):
        """Replace one file in the docx file

        First note that a MSWord .docx file is in fact a zip file.

        This method can be used to replace document embedded in the docx template.

        Some embedded document may have been modified by MSWord while saving
        the template : thus replace_embedded() cannot be used as CRC is not the
        same as the original file.

        This method works for embedded MSWord file like Excel or PowerPoint file,
        but won't work for others like PDF, Python or even Text files :
        For these ones, MSWord generate an oleObjectNNN.bin file which is no
        use to be replaced as it is encoded.

        Syntax:

        tpl.replace_zipname(
            'word/embeddings/Feuille_Microsoft_Office_Excel1.xlsx',
            'my_excel_file.xlsx')

        The zipname is the one you can find when you open docx with WinZip,
        7zip (Windows) or unzip -l (Linux). The zipname starts with
        "word/embeddings/". Note that the file is renamed by MSWord,
        so you have to guess a little bit...

        dst_file may be a file path or a file-like object.
        """
        self.zipname_to_replace[zipname] = self._read_bytes(dst_file)

    def post_processing(self, docx_file):
        """Rewrite the saved docx, substituting registered media/embeddings.

        Does nothing unless ``replace_media``, ``replace_embedded`` or
        ``replace_zipname`` registered something.  ``docx_file`` is the path
        or file-like object the document has just been saved to.
        """
        if (self.crc_to_new_media or
                self.crc_to_new_embedded or
                self.zipname_to_replace):

            if hasattr(docx_file, 'read'):
                # Copy the stream aside, then empty it to receive the new zip.
                tmp_file = io.BytesIO()
                DocxTemplate(docx_file).save(tmp_file)
                tmp_file.seek(0)
                docx_file.seek(0)
                docx_file.truncate()
                docx_file.seek(0)
            else:
                tmp_file = '%s_docxtpl_before_replace_medias' % docx_file
                os.rename(docx_file, tmp_file)

            with zipfile.ZipFile(tmp_file) as zin:
                with zipfile.ZipFile(docx_file, 'w') as zout:
                    for item in zin.infolist():
                        if item.filename in self.zipname_to_replace:
                            zout.writestr(item, self.zipname_to_replace[item.filename])
                        elif (item.filename.startswith('word/media/') and
                              item.CRC in self.crc_to_new_media):
                            zout.writestr(item, self.crc_to_new_media[item.CRC])
                        elif (item.filename.startswith('word/embeddings/') and
                              item.CRC in self.crc_to_new_embedded):
                            zout.writestr(item, self.crc_to_new_embedded[item.CRC])
                        else:
                            # untouched member: copy as is
                            zout.writestr(item, zin.read(item.filename))

            if not hasattr(tmp_file, 'read'):
                os.remove(tmp_file)
            if hasattr(docx_file, 'read'):
                docx_file.seek(0)

    def pre_processing(self):
        """Apply the pictures registered with ``replace_pic``.

        Raises ``ValueError`` when a registered picture name is not found in
        the template.
        """
        if self.pic_to_replace:
            self.build_pic_map()

            # Do the actual replacement
            for embedded_file, stream in six.iteritems(self.pic_to_replace):
                if embedded_file not in self.pic_map:
                    raise ValueError('Picture "%s" not found in the docx template'
                                     % embedded_file)
                self.pic_map[embedded_file][1]._blob = stream

    def build_pic_map(self):
        """Searches in docx template all the xml pictures tag and store them
        in pic_map dict"""
        if self.pic_to_replace:
            # Main document
            part = self.docx.part
            self.pic_map.update(self._img_filename_to_part(part))

            # Header/Footer
            for relid, rel in six.iteritems(self.docx.part.rels):
                if rel.reltype in (REL_TYPE.HEADER, REL_TYPE.FOOTER):
                    self.pic_map.update(self._img_filename_to_part(rel.target_part))

    def get_pic_map(self):
        """Return the picture-name to ``(target_ref, target_part)`` mapping."""
        return self.pic_map

    def _img_filename_to_part(self, doc_part):
        """Map picture names found in ``doc_part`` to their image parts.

        Returns a dict ``{name: (target_ref, target_part)}`` built from the
        ``a:graphicData`` elements whose uri is the picture namespace and
        whose blip has an ``r:embed`` relationship id.
        """
        et = etree.fromstring(doc_part.blob)

        part_map = {}

        gds = et.xpath('//a:graphic/a:graphicData', namespaces=docx.oxml.ns.nsmap)
        for gd in gds:
            rel = None
            # Either IMAGE, CHART, SMART_ART, ...
            try:
                if gd.attrib['uri'] == docx.oxml.ns.nsmap['pic']:
                    # Either PICTURE or LINKED_PICTURE image
                    blip = gd.xpath('pic:pic/pic:blipFill/a:blip',
                                    namespaces=docx.oxml.ns.nsmap)[0]
                    dest = blip.xpath('@r:embed', namespaces=docx.oxml.ns.nsmap)
                    if len(dest) > 0:
                        rel = dest[0]
                    else:
                        continue
                else:
                    continue

                # title=inl.xpath('wp:docPr/@title',namespaces=docx.oxml.ns.nsmap)[0]
                name = gd.xpath('pic:pic/pic:nvPicPr/pic:cNvPr/@name',
                                namespaces=docx.oxml.ns.nsmap)[0]

                part_map[name] = (doc_part.rels[rel].target_ref,
                                  doc_part.rels[rel].target_part)

            # Deliberately best-effort: a graphic that cannot be resolved is
            # skipped rather than aborting the whole scan.
            # FIXME: figure out what exceptions are thrown here and catch more
            # specific exceptions
            except Exception:
                continue

        return part_map

    def build_url_id(self, url):
        """Register ``url`` as an external hyperlink and return its rel id."""
        return self.docx._part.relate_to(url, REL_TYPE.HYPERLINK,
                                         is_external=True)

    def save(self, filename, *args, **kwargs):
        """Save the document, applying picture/media replacements.

        Extra arguments are forwarded to the python-docx ``save``.
        """
        self.pre_processing()
        self.docx.save(filename, *args, **kwargs)
        self.post_processing(filename)

    def get_undeclared_template_variables(self, jinja_env=None):
        """Return the set of variable names used in body, headers and footers.

        Parsing is done with ``jinja_env`` when given, a default
        ``Environment`` otherwise.
        """
        xml = self.get_xml()
        xml = self.patch_xml(xml)
        for uri in [self.HEADER_URI, self.FOOTER_URI]:
            for relKey, _xml in self.get_headers_footers_xml(uri):
                xml += self.patch_xml(_xml)
        if jinja_env:
            env = jinja_env
        else:
            env = Environment()
        parse_content = env.parse(xml)
        return meta.find_undeclared_variables(parse_content)

    undeclared_template_variables = property(get_undeclared_template_variables)


class Subdoc(object):
    """ Class for subdocument to insert into master document """

    def __init__(self, tpl, docpath=None):
        self.tpl = tpl
        self.docx = tpl.get_docx()
        self.subdocx = Document(docpath)
        # share the master document part so that relationships created from
        # the subdocument land in the master document
        self.subdocx._part = self.docx._part

    def __getattr__(self, name):
        """Delegate unknown attributes to the sub-document."""
        # avoid infinite recursion when 'subdocx' itself is not set yet
        if name == 'subdocx':
            raise AttributeError(name)
        return getattr(self.subdocx, name)

    def _get_xml(self):
        """Return the sub-document body content, without ``w:body`` tags."""
        if self.subdocx._element.body.sectPr is not None:
            self.subdocx._element.body.remove(self.subdocx._element.body.sectPr)
        xml = re.sub(r'</?w:body[^>]*>', '',
                     etree.tostring(self.subdocx._element.body,
                                    encoding='unicode', pretty_print=False))
        return xml

    def __unicode__(self):
        return self._get_xml()

    def __str__(self):
        return self._get_xml()

    def __html__(self):
        return self._get_xml()


class RichText(object):
    """ class to generate Rich Text when using templates variables

    This is much faster than using Subdoc class,
    but this only for texts INSIDE an existing paragraph.
    """

    def __init__(self, text=None, **text_prop):
        self.xml = ''
        if text:
            self.add(text, **text_prop)

    def add(self, text, style=None,
            color=None,
            highlight=None,
            size=None,
            subscript=None,
            superscript=None,
            bold=False,
            italic=False,
            underline=False,
            strike=False,
            font=None,
            url_id=None):
        """Append ``text`` as one formatted run (or another RichText as is).

        ``color`` and ``highlight`` may start with ``#``; an unknown
        ``underline`` value falls back to ``'single'``; ``url_id`` wraps the
        run into a hyperlink with that relationship id.
        """

        # If a RichText is added
        if isinstance(text, RichText):
            self.xml += text.xml
            return

        # If not a string : cast to string (ex: int, dict etc...)
        if not isinstance(text, (six.text_type, six.binary_type)):
            text = six.text_type(text)
        if not isinstance(text, six.text_type):
            text = text.decode('utf-8', errors='ignore')
        text = (escape(text)
                .replace('\n', NEWLINE_XML)
                .replace('\a', NEWPARAGRAPH_XML)
                .replace('\t', TAB_XML)
                .replace('\f', PAGE_BREAK))

        prop = u''

        if style:
            prop += u'<w:rStyle w:val="%s"/>' % style
        if color:
            if color[0] == '#':
                color = color[1:]
            prop += u'<w:color w:val="%s"/>' % color
        if highlight:
            if highlight[0] == '#':
                highlight = highlight[1:]
            prop += u'<w:shd w:fill="%s"/>' % highlight
        if size:
            prop += u'<w:sz w:val="%s"/>' % size
            prop += u'<w:szCs w:val="%s"/>' % size
        if subscript:
            prop += u'<w:vertAlign w:val="subscript"/>'
        if superscript:
            prop += u'<w:vertAlign w:val="superscript"/>'
        if bold:
            prop += u'<w:b/>'
        if italic:
            prop += u'<w:i/>'
        if underline:
            if underline not in ['single', 'double', 'thick', 'dotted', 'dash',
                                 'dotDash', 'dotDotDash', 'wave']:
                underline = 'single'
            prop += u'<w:u w:val="%s"/>' % underline
        if strike:
            prop += u'<w:strike/>'
        if font:
            prop += (u'<w:rFonts w:ascii="{font}" w:hAnsi="{font}" w:cs="{font}"/>'
                     .format(font=font))

        xml = u'<w:r>'
        if prop:
            xml += u'<w:rPr>%s</w:rPr>' % prop
        xml += u'<w:t xml:space="preserve">%s</w:t></w:r>' % text
        if url_id:
            xml = (u'<w:hyperlink r:id="%s" w:tgtFrame="_blank">%s</w:hyperlink>'
                   % (url_id, xml))
        self.xml += xml

    def __unicode__(self):
        return self.xml

    def __str__(self):
        return self.xml

    def __html__(self):
        return self.xml


R = RichText


class Listing(object):
    r"""class to manage \n and \a without to use RichText,
    by this way you keep the current template styling

    use {{ mylisting }} in your template and
    context={ mylisting:Listing(the_listing_with_newlines) }
    """

    def __init__(self, text):
        # If not a string : cast to string (ex: int, dict etc...)
        if not isinstance(text, (six.text_type, six.binary_type)):
            text = six.text_type(text)
        # bytes are decoded the same way RichText.add() does
        if not isinstance(text, six.text_type):
            text = text.decode('utf-8', errors='ignore')
        self.xml = (escape(text)
                    .replace('\n', NEWLINE_XML)
                    .replace('\a', NEWPARAGRAPH_XML)
                    .replace('\t', TAB_XML)
                    .replace('\f', PAGE_BREAK))

    def __unicode__(self):
        return self.xml

    def __str__(self):
        return self.xml

    def __html__(self):
        return self.xml


class InlineImage(object):
    """Class to generate an inline image

    This is much faster than using Subdoc class.
    """
    tpl = None
    image_descriptor = None
    width = None
    height = None

    def __init__(self, tpl, image_descriptor, width=None, height=None):
        self.tpl, self.image_descriptor = tpl, image_descriptor
        self.width, self.height = width, height

    def _insert_image(self):
        """Add the picture to the template part and return the drawing xml.

        The returned fragment closes the current text run, inserts the
        drawing in its own run and re-opens a space-preserving text run.
        """
        pic = self.tpl.docx._part.new_pic_inline(
            self.image_descriptor,
            self.width,
            self.height
        ).xml
        return '</w:t></w:r><w:r><w:drawing>%s</w:drawing></w:r><w:r>' \
               '<w:t xml:space="preserve">' % pic

    def __unicode__(self):
        return self._insert_image()

    def __str__(self):
        return self._insert_image()

    def __html__(self):
        return self._insert_image()