Source code for polyglot.htmlCleaner

#!/usr/local/bin/python
# encoding: utf-8
"""
*using the Mercury Parser API to clean up a local html file*

:Author:
    David Young

:Date Created:
    October  1, 2016
"""
################# GLOBAL IMPORTS ####################
import sys
import os
import codecs
import re
os.environ['TERM'] = 'vt100'
from fundamentals import tools
import requests
from fundamentals.files.tag import tag


[docs]class htmlCleaner():
    """
    *A parser/cleaner to strip a webpage article of all cruft and neatly present it with some nice css*

    **Key Arguments:**
        - ``log`` -- logger
        - ``settings`` -- the settings dictionary
        - ``url`` -- the URL to the HTML page to parse and clean
        - ``outputDirectory`` -- path to the directory to save the output html file to
        - ``title`` -- title of the document to save. If *False* will take the title of the HTML page as the filename. Default *False*.
        - ``style`` -- add polyglot's styling to the HTML document. Default *True*
        - ``metadata`` -- include metadata in generated HTML. Default *True*
        - ``h1`` -- include title as H1 at the top of the doc. Default *True*

    **Usage:**

        To generate the HTML page, using the title of the webpage as the filename:

        .. code-block:: python 

            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=log,
                settings=settings,
                url="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
                outputDirectory="/tmp"
            )
            cleaner.clean()  

        Or specify the title of the document and remove styling, metadata and title:

        .. code-block:: python 

            from polyglot import htmlCleaner
            cleaner = htmlCleaner(
                log=log,
                settings=settings,
                url="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
                outputDirectory="/tmp",
                title="my_clean_doc",
                style=False,
                metadata=False,
                h1=False
            )
            cleaner.clean() 

    """
    # INITIALISATION

[docs]    def __init__(
            self,
            log,
            settings,
            url,
            outputDirectory=False,
            title=False,
            style=True,
            metadata=True,
            h1=True
    ):
        self.log = log
        log.debug("instansiating a new 'htmlCleaner' object")
        self.settings = settings
        self.url = url
        self.outputDirectory = outputDirectory
        self.title = title
        self.style = style
        self.metadata = metadata
        self.h1 = h1

        # INITIAL ACTIONS

        return None

[docs]    def clean(
            self):
        """*parse and clean the html document with Mercury Parser*

        **Return:**
            - ``filePath`` -- path to the cleaned HTML document

        **Usage:**

            See class usage 
        """
        self.log.info('starting the ``clean`` method')

        url = self.url

        # PARSE THE CONTENT OF THE WEBPAGE AT THE URL

        parser_response = self._request_parsed_article_from_mercury(url)
        if "503" in str(parser_response):
            return None
        article = parser_response.json()

        if not article:
            return None

        # GRAB THE CSS USED TO STYLE THE WEBPAGE/PDF CONTENT
        if self.style:
            moduleDirectory = os.path.dirname(__file__)
            cssFile = moduleDirectory + "/css/main.css"
            pathToReadFile = cssFile
            readFile = codecs.open(pathToReadFile, encoding='utf-8', mode='r')
            thisCss = readFile.read()
            readFile.close()
        else:
            thisCss = ""

        # CATCH ERRORS
        if "error" in article and article["error"] == True:
            print url
            print "    " + article["messages"]
            return None
        try:
            text = article["content"]
        except:
            print "Can't decode the text of %(url)s - moving on" % locals()
            return None

        # COMMON FIXES TO HTML TO RENDER CORRECTLY
        regex = re.compile(
            u'<span class="mw-editsection"><span class="mw-editsection-bracket">.*"mw-editsection-bracket">]')
        text = regex.sub(u"", text)
        regex2 = re.compile(
            u'\<sup class="noprint.*better source needed\<\/span\>\<\/a\>\<\/i\>\]\<\/sup\>', re.I)
        text = regex2.sub(u"", text)
        regex2 = re.compile(
            u'\<a href="https\:\/\/en\.wikipedia\.org\/wiki\/.*(\#.*)"\>\<span class=\"tocnumber\"\>', re.I)
        text = regex2.sub(u'<a href="\g<1>"><span class="tocnumber">', text)
        regex = re.compile(
            u'srcset=".*?">')
        text = regex.sub(u"", text)

        # GRAB HTML TITLE IF NOT SET IN ARGUMENTS
        if self.title == False:
            title = article["title"].encode("utf-8", "ignore")
            title = title.decode("utf-8")
            title = title.encode("ascii", "ignore")
            rstrings = """:/"&\\'`"""
            for i in rstrings:
                title = title.replace(i, "")

            # USE DATETIME IF TITLE STILL NOT SET
            if len(title) == 0:
                from datetime import datetime, date, time
                now = datetime.now()
                title = now.strftime("%Y%m%dt%H%M%S")
            self.title = title

        title = self.title.replace(".html", "")
        pageTitle = title.replace("_", " ")

        # REGENERATE THE HTML DOCUMENT WITH CUSTOM STYLE
        filePath = self.outputDirectory + "/" + title + ".html"
        writeFile = codecs.open(
            filePath, encoding='utf-8', mode='w')
        if self.metadata:
            metadata = "<title>%(title)s</title>" % locals()
        else:
            metadata = ""

        if self.h1:
            h1 = "<h1>%(pageTitle)s</h1>" % locals()
        else:
            h1 = ""
        content = u"""
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
%(metadata)s 

<style>
%(thisCss)s
</style>

</head>
<body>

%(h1)s 
<a href="%(url)s">original source</a>
</br></br>


%(text)s 
</body>
</html>""" % locals()
        writeFile.write(content)
        writeFile.close()

        self.log.info('completed the ``clean`` method')

        tag(
            log=self.log,
            filepath=filePath,
            tags=False,
            rating=False,
            wherefrom=self.url
        )

        return filePath

    def _request_parsed_article_from_mercury(
            self,
            url):
        """* request parsed article from mercury*

        **Key Arguments:**
            - ``url`` -- the URL to the HTML page to parse and clean

        **Return:**
            - None

        **Usage:**
            ..  todo::

                - add usage info
                - create a sublime snippet for usage
                - update package tutorial if needed

            .. code-block:: python 

                usage code 

        """
        self.log.info(
            'starting the ``_request_parsed_article_from_mercury`` method')

        try:
            response = requests.get(
                url="https://mercury.postlight.com/parser",
                params={
                    "url": url,
                },
                headers={
                    "x-api-key": self.settings["mercury api key"],
                },
            )

        except requests.exceptions.RequestException:
            print('HTTP Request failed')

        self.log.info(
            'completed the ``_request_parsed_article_from_mercury`` method')
        return response

    # use the tab-trigger below for new method
    # xt-class-method