Source code for polyglot.ebook

#!/usr/local/bin/python
# encoding: utf-8
"""
*Convert various document formats to epub or mobi*

:Author:
    David Young

:Date Created:
    October  9, 2016
"""
################# GLOBAL IMPORTS ####################
import sys
import os
import shutil
os.environ['TERM'] = 'vt100'
from fundamentals import tools
import codecs
from subprocess import Popen, PIPE
from datetime import datetime, date, time
from fundamentals.files.tag import tag


[docs]class ebook():
    """
    *The worker class for the ebook module*

    **Key Arguments:**
        - ``log`` -- logger
        - ``settings`` -- the settings dictionary
        - ``urlOrPath`` -- the url or path to the content source
        - ``bookFormat`` -- the output format (epub, mobi)
        - ``outputDirectory`` -- path to the directory to save the output html file to.
        - ``title`` -- the title of the output document. I. False then use the title of the original source. Default *False*
        - ``header`` -- content to add before the article/book content in the resulting ebook. Default *False*
        - ``footer`` -- content to add at the end of the article/book content in the resulting ebook. Default *False*

    **Usage:**

        **WebToEpub**

        To generate an ebook from an article found on the web, using the webpages's title as the filename for the book:

        .. code-block:: python

            from polyglot import ebook
            epub = ebook(
                log=log,
                settings=settings,
                urlOrPath="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
                title=False,
                bookFormat="epub",
                outputDirectory="/path/to/output/folder"
            )
            pathToEpub = epub.get()

        To add a header and footer to the epub book, and specify the title/filename for the book:

        .. code-block:: python

            from polyglot import ebook
            epub = ebook(
                log=log,
                settings=settings,
                urlOrPath="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
                title="MySQL Sucker",
                bookFormat="epub",
                outputDirectory="/path/to/output/folder",
                header='<a href="http://www.thespacedoctor.co.uk">thespacedoctor</a>',
                footer='<a href="http://www.thespacedoctor.co.uk">thespacedoctor</a>'
            )
            pathToEpub = epub.get()

        **WebToMobi**

        To generate a mobi version of the webarticle, just switch *epub* for *mobi*:

        .. code-block:: python

            from polyglot import ebook
            mobi = ebook(
                log=log,
                settings=settings,
                urlOrPath="http://www.thespacedoctor.co.uk/blog/2016/09/26/mysqlSucker-index.html",
                title="MySQL Sucker",
                bookFormat="mobi",
                outputDirectory="/path/to/output/folder",
                header='<a href="http://www.thespacedoctor.co.uk">thespacedoctor</a>',
                footer='<a href="http://www.thespacedoctor.co.uk">thespacedoctor</a>'
            )
            pathToMobi = mobi.get()

        **DocxToEpub**

        To instead convert a DOCX document to epub, simply switch out the URL for the path to the DOCX file, like so:

        .. code-block:: python

            from polyglot import ebook
            epub = ebook(
                log=log,
                settings=settings,
                urlOrPath="/path/to/Volkswagen.docx",
                title="A book about a car",
                bookFormat="epub",
                outputDirectory="/path/to/output/folder",
                header='<a href="http://www.thespacedoctor.co.uk">thespacedoctor</a>',
                footer='<a href="http://www.thespacedoctor.co.uk">thespacedoctor</a>'
            )
            pathToEpub = epub.get()

        **DocxToMobi**

        You can work it out yourself by now!
    """
    # Initialisation

[docs]    def __init__(
            self,
            log,
            settings,
            urlOrPath,
            outputDirectory,
            bookFormat,
            title=False,
            header=False,
            footer=False
    ):
        self.log = log
        log.debug("instansiating a new 'ebook' object")
        self.settings = settings
        self.title = title
        self.header = header
        self.footer = footer
        self.urlOrPath = urlOrPath
        self.outputDirectory = outputDirectory
        self.format = bookFormat
        # xt-self-arg-tmpx

        # Initial Actions

        return None

[docs]    def get(self):
        """
        *get the ebook object*

        **Return:**
            - ``ebook``

        **Usage:**

            See class docstring for usage
        """
        self.log.info('starting the ``get`` method')

        if self.format == "epub":
            if self.urlOrPath[:4] == "http" or self.urlOrPath[:4] == "www.":
                ebook = self._url_to_epub()
            elif ".docx" in self.urlOrPath:
                ebook = self._docx_to_epub()

        if self.format == "mobi":
            if self.urlOrPath[:4] == "http" or self.urlOrPath[:4] == "www.":
                epub = self._url_to_epub()
            elif ".docx" in self.urlOrPath:
                epub = self._docx_to_epub()

            if not epub:
                return None

            ebook = self._epub_to_mobi(
                epubPath=epub,
                deleteEpub=False
            )

        tag(
            log=self.log,
            filepath=ebook,
            tags=False,
            rating=False,
            wherefrom=self.url
        )

        self.log.info('completed the ``get`` method')
        return ebook

    def _url_to_epub(
            self):
        """*generate the epub book from a URL*
        """
        self.log.info('starting the ``_url_to_epub`` method')

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.urlOrPath,
            outputDirectory=self.outputDirectory,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=False,  # add simpdf's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=False  # include title as H1 at the top of the doc
        )
        html = cleaner.clean()

        if not html:
            return None

        if self.footer:
            footer = self._tmp_html_file(self.footer)
            footer = '"%(footer)s"' % locals()
        else:
            footer = ""

        if self.header:
            header = self._tmp_html_file(self.header)
            header = '"%(header)s"' % locals()
        else:
            header = ""

        # HTML SOURCE FILE
        epub = html.replace(".html", ".epub")
        pandoc = self.settings["executables"]["pandoc"]

        cmd = """%(pandoc)s -S -s -f html -t epub3 %(header)s '%(html)s' %(footer)s -o '%(epub)s' """ % locals(
        )
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())

        try:
            with open(epub):
                pass
            fileExists = True
        except IOError:
            fileExists = False
            raise IOError(
                "the epub %s does not exist on this machine, here is the failure message: %s" % (epub, stderr))

        os.remove(html)

        self.log.info('completed the ``_url_to_epub`` method')
        return epub

    def _tmp_html_file(
            self,
            content):
        """*create a tmp html file with some content used for the header or footer of the ebook*

        **Key Arguments:**
            - ``content`` -- the content to include in the HTML file.
        """
        self.log.info('starting the ``_tmp_html_file`` method')

        content = """

<hr>
<div style="text-align: center">
%(content)s
</div>
<hr>

""" % locals()

        now = datetime.now()
        now = now.strftime("%Y%m%dt%H%M%S%f")
        pathToWriteFile = "/tmp/%(now)s.html" % locals()
        try:
            self.log.debug("attempting to open the file %s" %
                           (pathToWriteFile,))
            writeFile = codecs.open(
                pathToWriteFile, encoding='utf-8', mode='w')
        except IOError, e:
            message = 'could not open the file %s' % (pathToWriteFile,)
            self.log.critical(message)
            raise IOError(message)
        writeFile.write(content)
        writeFile.close()

        self.log.info('completed the ``_tmp_html_file`` method')
        return pathToWriteFile

    def _epub_to_mobi(
            self,
            epubPath,
            deleteEpub=False):
        """*convert the give epub to mobi format using kindlegen*

        **Key Arguments:**
            - ``epubPath`` -- path to the epub book
            - ``deleteEpub`` -- delete the epub when mobi is generated. Default *False*

        **Return:**
            - ``mobi`` -- the path to the generated mobi book
        """
        self.log.info('starting the ``_epub_to_mobi`` method')

        mobi = epubPath.replace(".epub", ".mobi")
        kindlegen = self.settings["executables"]["kindlegen"]
        cmd = """%(kindlegen)s "%(epubPath)s" """ % locals(
        )
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())

        try:
            with open(mobi):
                pass
            fileExists = True
        except IOError:
            fileExists = False
            self.log.error(
                "the mobi %s does not exist on this machine. The kindlegen error was: %s" % (mobi, stdout))
            return False

        if deleteEpub:
            os.remove(epubPath)

        self.log.info('completed the ``_epub_to_mobi`` method')
        return mobi

    def _docx_to_epub(
            self):
        """*convert docx file to epub*
        """
        self.log.info('starting the ``_docx_to_epub`` method')

        if self.footer:
            footer = self._tmp_html_file(self.footer)
            footer = '"%(footer)s"' % locals()
        else:
            footer = ""

        if self.header:
            header = self._tmp_html_file(self.header)
            header = '"%(header)s"' % locals()
        else:
            header = ""

        # FIRST CONVERT THE DOC TO HTML
        docx = self.urlOrPath

        if self.title:
            title = self.title.replace(".html", "")
            html = "/tmp/" + self.title.replace(".html", "") + ".html"
        else:
            title = os.path.basename(docx).replace(
                ".docx", "").replace("_", " ")
            html = "/tmp/" + os.path.basename(docx).replace(".docx", ".html")
        pandoc = self.settings["executables"]["pandoc"]

        # TMP IMAGE DIR
        now = datetime.now()
        now = now.strftime("%Y%m%dt%H%M%S")
        imageDir = "/tmp/%(now)s" % locals()
        if not os.path.exists(imageDir):
            os.makedirs(imageDir)
        cmd = """%(pandoc)s --extract-media=%(imageDir)s -t html -f docx '%(docx)s' -o '%(html)s' """ % locals()
        p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())

        try:
            with open(html):
                pass
            fileExists = True
        except IOError:
            fileExists = False
            self.log.error(
                "the html %s does not exist on this machine, here is the failure message: %s" % (html, stderr))
            try:
                shutil.rmtree(imageDir)
            except:
                pass
            return None

        if fileExists:
            if self.outputDirectory:
                epub = self.outputDirectory + "/" + \
                    os.path.basename(html).replace(".html", ".epub")
            else:
                epub = docx.replace(".docx", ".epub")
            pandoc = self.settings["executables"]["pandoc"]

            cmd = """%(pandoc)s --metadata=title:'%(title)s' -S -s -f html -t epub3 %(header)s '%(html)s' %(footer)s -o '%(epub)s' """ % locals(
            )
            p = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
            stdout, stderr = p.communicate()
            self.log.debug('output: %(stdout)s' % locals())

            try:
                shutil.rmtree(imageDir)
                os.remove(html)
            except:
                pass

            try:
                with open(epub):
                    pass
                fileExists = True
            except IOError:
                fileExists = False
                self.log.error(
                    "the epub %s does not exist on this machine, here is the failure message: %s" % (epub, stderr))
                return None

        self.log.info('completed the ``_docx_to_epub`` method')
        return epub

    # use the tab-trigger below for new method
    # xt-class-method