#!/usr/local/bin/python
# encoding: utf-8
"""
*Print webpages to PDF*
:Author:
David Young
:Date Created:
September 28, 2015
"""
################# GLOBAL IMPORTS ####################
import sys
import os
os.environ['TERM'] = 'vt100'
import readline
import glob
import pickle
import re
import codecs
from subprocess import Popen, PIPE, STDOUT
from docopt import docopt
from fundamentals import tools, times
import codecs
from fundamentals.files.tag import tag
# SET ENCODE ERROR RETURN VALUE
def handler(e):
    """
    *Codec error handler that swaps the offending character for a space*

    **Key Arguments:**
        - ``e`` -- the unicode error raised by the codec (only ``e.start`` is read)

    **Return:**
        - a ``(replacement, position)`` tuple: a single space, and the index one past ``e.start`` from which the codec resumes
    """
    resume_at = e.start + 1
    return (u' ', resume_at)


# MAKE THE HANDLER AVAILABLE AS ``errors='dryx'``
codecs.register_error('dryx', handler)
###################################################################
# CLASSES #
###################################################################
class printpdf():
    """
    *PDF printer*

    **Key Arguments:**
        - ``log`` -- logger
        - ``settings`` -- the settings dictionary
        - ``url`` -- the webpage url
        - ``title`` -- title of pdf
        - ``folderpath`` -- path at which to save pdf
        - ``append`` -- append this at the end of the file name (not title)
        - ``readability`` -- clean text with Mercury Parser

    **Usage:**

        To print a webpage to PDF without any cleaning of the content using the title of the webpage as filename:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                readability=False
            ).get()

        To give the PDF an alternative title use:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                title="Cars",
                readability=False
            ).get()

        Or to append a string to the end of the filename before *.pdf* extension (useful for indexing or adding date created etc):

        .. code-block:: python

            from datetime import datetime, date, time
            now = datetime.now()
            now = now.strftime("%Y%m%dt%H%M%S")

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath="/path/to/output",
                append="_"+now,
                readability=False
            ).get()

        To clean the content using the Mercury Parser and apply some simple styling and pretty fonts:

        .. code-block:: python

            from polyglot import printpdf
            pdf = printpdf(
                log=log,
                settings=settings,
                url="https://en.wikipedia.org/wiki/Volkswagen",
                folderpath=pathToOutputDir,
                readability=True
            ).get()
    """
    # Initialisation

    def __init__(
            self,
            log,
            settings=False,
            url=False,
            title=False,
            folderpath=False,
            append=False,
            readability=True
    ):
        self.log = log
        log.debug("instansiating a new 'print' object")
        self.settings = settings
        self.url = url
        self.folderpath = folderpath
        self.title = title
        self.append = append
        self.readability = readability

    def get(self):
        """
        *get the PDF*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF (``None`` if the cleaner produced no HTML file)
        """
        self.log.info('starting the ``get`` method')

        # APPEND TO FILENAME?
        if not self.append:
            self.append = ""

        if not self.readability:
            pdfPath = self._print_original_webpage()
        else:
            pdfPath = self._print_parsed_webpage()

        # THE PARSED PRINTER RETURNS NOTHING WHEN NO HTML WAS PRODUCED -
        # THERE IS NO FILE TO TAG IN THAT CASE
        if pdfPath:
            tag(
                log=self.log,
                filepath=pdfPath,
                tags=False,
                rating=False,
                wherefrom=self.url
            )

        self.log.info('completed the ``get`` method')
        return pdfPath

    def _render_pdf(
            self,
            source,
            pdfPath,
            extraArgs=()):
        """*run the electron PDF executable and check the PDF was written*

        **Key Arguments:**
            - ``source`` -- URL or path of the HTML file to print
            - ``pdfPath`` -- path the PDF should be written to
            - ``extraArgs`` -- any extra command-line flags for the executable

        The process exits (``sys.exit``) if ``pdfPath`` does not exist afterwards.
        """
        import shlex

        # THE SETTING MAY HOLD A COMMAND WITH ARGUMENTS AND ~ OR $VARS, SO
        # SPLIT IT SHELL-STYLE AND EXPAND EACH TOKEN OURSELVES
        electron = self.settings["executables"]["electron path"]
        cmd = [os.path.expanduser(os.path.expandvars(token))
               for token in shlex.split(electron)]
        # ARGUMENTS ARE PASSED AS A LIST (NO SHELL) SO QUOTES, `$` OR
        # BACKTICKS IN A URL OR PAGE TITLE CANNOT BREAK OR INJECT INTO
        # THE COMMAND
        cmd += ["-i", source, "-o", pdfPath]
        cmd += list(extraArgs)

        try:
            p = Popen(cmd, stdout=PIPE, stderr=PIPE)
        except OSError as e:
            # WITHOUT A SHELL A MISSING EXECUTABLE RAISES INSTEAD OF WRITING
            # TO STDERR - REPORT IT AND FALL THROUGH TO THE CHECK BELOW
            print("could not run '%s': %s" % (electron, e))
        else:
            stdout, stderr = p.communicate()
            self.log.debug('output: %(stdout)s' % locals())
            if stderr:
                print(stderr)

        if not os.path.exists(pdfPath):
            print("%(pdfPath)s was not generated for some reason - please investigate" % locals())
            # NOTE(review): exits with status 0 even though this is a
            # failure - kept for backward compatibility, confirm intent
            sys.exit(0)

    def _print_original_webpage(
            self):
        """*print the original webpage*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF
        """
        self.log.info('starting the ``_print_original_webpage`` method')

        if not self.title:
            # NO TITLE GIVEN - RUN THE CLEANER ONLY TO LEARN THE FILENAME IT
            # CHOOSES, THEN DISCARD THE HTML IT WROTE
            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=self.log,
                settings=self.settings,
                url=self.url,
                outputDirectory=self.folderpath,
                title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
                style=True,  # add polyglot's styling to the HTML document
                # include metadata in generated HTML (e.g. title),
                metadata=True,
                h1=True  # include title as H1 at the top of the doc
            )
            htmlFile = cleaner.clean()
            title = os.path.basename(htmlFile)
            # STRIP ONLY THE TRAILING EXTENSION, NOT EVERY ".html" IN THE NAME
            if title.endswith(".html"):
                title = title[:-len(".html")]
            os.remove(htmlFile)
        else:
            title = self.title

        # CONVERT TO PDF WITH ELECTRON PDF
        suffix = self.append or ""
        pdfPath = self.folderpath + "/" + title + suffix + ".pdf"
        self._render_pdf(self.url, pdfPath, ["--printBackground"])

        self.log.info('completed the ``_print_original_webpage`` method')
        return pdfPath

    def _print_parsed_webpage(
            self):
        """*print the parsed/cleaned webpage*

        **Return:**
            - ``pdfPath`` -- the path to the generated PDF (``None`` if the cleaner produced no HTML file)
        """
        self.log.info('starting the ``_print_parsed_webpage()`` method')

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.url,
            outputDirectory=self.folderpath,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=True,  # add polyglot's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=True  # include title as H1 at the top of the doc
        )
        htmlFile = cleaner.clean()
        if not htmlFile:
            return

        # SWAP ONLY THE TRAILING ".html" FOR THE PDF SUFFIX SO A ".html"
        # ELSEWHERE IN THE PATH IS LEFT ALONE
        suffix = self.append or ""
        if htmlFile.endswith(".html"):
            pdfPath = htmlFile[:-len(".html")] + suffix + ".pdf"
        else:
            pdfPath = htmlFile.replace(".html", suffix + ".pdf")

        # CONVERT TO PDF WITH ELECTRON PDF
        try:
            self._render_pdf(htmlFile, pdfPath)
        finally:
            # REMOVE HTML FILE (EVEN WHEN THE PDF CHECK EXITS)
            os.remove(htmlFile)

        self.log.info('completed the ``_print_parsed_webpage()`` method')
        return pdfPath
# Script entry point: calls ``main()`` when the file is executed directly.
# NOTE(review): no ``main`` is defined or imported in the visible source, so
# this would raise NameError when run as a script - confirm whether a
# ``main`` function is missing or this guard should be removed.
if __name__ == '__main__':
    main()