Source code for polyglot.printpdf

#!/usr/local/bin/python
# encoding: utf-8
"""
*Print webpages to PDF*

:Author:
    David Young

:Date Created:
    September 28, 2015
"""
################# GLOBAL IMPORTS ####################
import sys
import os
os.environ['TERM'] = 'vt100'
import readline
import glob
import pickle
import re
import codecs
from subprocess import Popen, PIPE, STDOUT
from docopt import docopt
from fundamentals import tools, times
import codecs
from fundamentals.files.tag import tag
# SET ENCODE ERROR RETURN VALUE


def handler(e):
    """*codec error handler that swaps an offending character for a single space*

    **Key Arguments:**
        - ``e`` -- the unicode error handed over by the codec machinery; only its ``start`` attribute is read

    **Return:**
        - ``(replacement, position)`` -- a one-space unicode string and the index (``e.start + 1``) at which the codec resumes
    """
    # resume one position after the start of the failure so that every
    # offending character is replaced by its own space
    return (u' ', e.start + 1)
# Register ``handler`` as the codec error handler named 'dryx', so the name
# can be passed as the ``errors`` argument of encode/decode calls
codecs.register_error('dryx', handler)

###################################################################
# CLASSES                                                         #
###################################################################
class printpdf():
    """
    *PDF printer*

    **Key Arguments:**
        - ``log`` -- logger
        - ``settings`` -- the settings dictionary
        - ``url`` -- the webpage url
        - ``title`` -- title of pdf
        - ``folderpath`` -- path at which to save pdf
        - ``append`` -- append this at the end of the file name (not title)
        - ``readability`` -- clean text with Mercury Parser

    **Usage:**

        To print a webpage to PDF without any cleaning of the content using the title of the webpage as filename:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                readability=False
            ).get()

        To give the PDF an alternative title use:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                title="Cars",
                readability=False
            ).get()

        Or to append a string to the end of the filename before *.pdf* extension (useful for indexing or adding date created etc):

        .. code-block:: python

            from datetime import datetime, date, time
            now = datetime.now()
            now = now.strftime("%Y%m%dt%H%M%S")

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                append="_"+now,
                readability=False
            ).get()

        To clean the content using the Mercury Parser and apply some simple styling and pretty fonts:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath=pathToOutputDir,
                readability=True
            ).get()
    """
    # Initialisation

    def __init__(
            self,
            log,
            settings=False,
            url=False,
            title=False,
            folderpath=False,
            append=False,
            readability=True
    ):
        self.log = log
        log.debug("instansiating a new 'print' object")
        self.settings = settings
        self.url = url
        self.folderpath = folderpath
        self.title = title
        self.append = append
        self.readability = readability
        # xt-self-arg-tmpx

        # INITIAL ACTIONS

        return None

    def get(self):
        """
        *get the PDF*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF (``None`` if no HTML could be produced for the page)
        """
        self.log.info('starting the ``get`` method')

        # APPEND TO FILENAME?
        if not self.append:
            self.append = ""

        if not self.readability:
            pdfPath = self._print_original_webpage()
        else:
            pdfPath = self._print_parsed_webpage()

        # THE PRINT METHODS RETURN NONE WHEN THE CLEANER PRODUCED NO HTML FILE;
        # ONLY TAG A FILE THAT WAS ACTUALLY GENERATED
        if pdfPath:
            tag(
                log=self.log,
                filepath=pdfPath,
                tags=False,
                rating=False,
                wherefrom=self.url
            )

        self.log.info('completed the ``get`` method')
        return pdfPath

    def _electron_print(
            self,
            source,
            pdfPath,
            extraFlags=""):
        """*run the configured electron executable to print ``source`` to ``pdfPath``*

        **Key Arguments:**
            - ``source`` -- the url or local html file handed to the executable via ``-i``
            - ``pdfPath`` -- the output path handed to the executable via ``-o``
            - ``extraFlags`` -- extra command-line flags appended verbatim to the command

        **Raises:**
            - ``SystemExit`` -- with a non-zero status if ``pdfPath`` does not exist once the command has finished
        """
        # shlex.quote ON PYTHON 3, pipes.quote ON PYTHON 2
        try:
            from shlex import quote
        except ImportError:
            from pipes import quote

        electron = self.settings["executables"]["electron path"]
        # THE EXECUTABLE SETTING IS STILL INTERPRETED BY THE SHELL (AS BEFORE),
        # BUT THE URL/PATHS ARE QUOTED SO THEIR CONTENT CANNOT BE EXECUTED
        cmd = "%s -i %s -o %s %s" % (
            electron, quote(source), quote(pdfPath), extraFlags)
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % {"stdout": stdout})
        if len(stderr):
            print(stderr)

        if not os.path.exists(pdfPath):
            print("%(pdfPath)s was not generated for some reason - please investigate" % {
                "pdfPath": pdfPath})
            # A MISSING PDF IS A FAILURE - EXIT WITH A NON-ZERO STATUS
            sys.exit(1)

        return None

    def _print_original_webpage(
            self):
        """*print the original webpage*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF (``None`` if the webpage title was needed but no HTML could be generated to read it from)
        """
        self.log.info('starting the ``_print_original_webpage`` method')

        if not self.title:
            # NO TITLE GIVEN - GENERATE THE CLEANED HTML ONLY TO BORROW ITS FILENAME
            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=self.log,
                settings=self.settings,
                url=self.url,
                outputDirectory=self.folderpath,
                title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
                style=True,  # add polyglot's styling to the HTML document
                # include metadata in generated HTML (e.g. title),
                metadata=True,
                h1=True  # include title as H1 at the top of the doc
            )
            htmlFile = cleaner.clean()
            if not htmlFile:
                return None
            # STRIP ONLY THE FILE EXTENSION, NOT EVERY '.html' IN THE NAME
            title = os.path.splitext(os.path.basename(htmlFile))[0]
            os.remove(htmlFile)
        else:
            title = self.title

        # CONVERT TO PDF WITH ELECTON PDF
        append = self.append or ""
        pdfPath = self.folderpath + "/" + title + append + ".pdf"
        self._electron_print(self.url, pdfPath, "--printBackground")

        self.log.info('completed the ``_print_original_webpage`` method')
        return pdfPath

    def _print_parsed_webpage(
            self):
        """*print the parsed/cleaned webpage*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF (``None`` if the cleaner returned no HTML file)
        """
        self.log.info('starting the ``_print_parsed_webpage()`` method')

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.url,
            outputDirectory=self.folderpath,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=True,  # add polyglot's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=True  # include title as H1 at the top of the doc
        )
        htmlFile = cleaner.clean()
        if not htmlFile:
            return

        # SWAP ONLY THE FILE EXTENSION, NOT EVERY '.html' IN THE PATH
        append = self.append or ""
        pdfPath = os.path.splitext(htmlFile)[0] + append + ".pdf"

        # CONVERT TO PDF WITH ELECTON PDF
        try:
            self._electron_print(htmlFile, pdfPath)
        finally:
            # REMOVE HTML FILE (EVEN WHEN THE PDF WAS NOT GENERATED)
            os.remove(htmlFile)

        self.log.info('completed the ``_print_parsed_webpage()`` method')
        return pdfPath
# NOTE(review): no ``main`` function is defined in the visible part of this
# module, so running the file directly would presumably raise a NameError -
# confirm, then either define ``main`` or drop this guard
if __name__ == '__main__': main()