# encoding: utf-8
"""
*Convert various document formats to epub or mobi*

:Author:
    David Young

:Date Created:
    October  9, 2016
"""
################# GLOBAL IMPORTS ####################
import sys
import os
import shutil
os.environ['TERM'] = 'vt100'
from fundamentals import tools
import codecs
from subprocess import Popen, PIPE
from datetime import datetime, date, time
from fundamentals.files.tag import tag

class ebook(object):
    """
    *The worker class for the ebook module*

    **Key Arguments:**
        - ``log`` -- logger
        - ``settings`` -- the settings dictionary
        - ``urlOrPath`` -- the url or path to the content source
        - ``bookFormat`` -- the output format (epub, mobi)
        - ``outputDirectory`` -- path to the directory to save the output html file to.
        - ``title`` -- the title of the output document. If False then use the title of the original source. Default *False*
        - ``header`` -- content to add before the article/book content in the resulting ebook. Default *False*
        - ``footer`` -- content to add at the end of the article/book content in the resulting ebook. Default *False*

    **Usage:**

        **WebToEpub**

        To generate an ebook from an article found on the web, using the webpages's title as the filename for the book:

        .. code-block:: python

            from polyglot import ebook
            epub = ebook(
                log=log,
                settings=settings,
                urlOrPath="https://example.com/some-article",
                title=False,
                bookFormat="epub",
                outputDirectory="/path/to/output/folder"
            )
            pathToEpub = epub.get()

        To add a header and footer to the epub book, and specify the title/filename for the book:

        .. code-block:: python

            from polyglot import ebook
            epub = ebook(
                log=log,
                settings=settings,
                urlOrPath="https://example.com/some-article",
                title="MySQL Sucker",
                bookFormat="epub",
                outputDirectory="/path/to/output/folder",
                header='<a href="https://example.com">thespacedoctor</a>',
                footer='<a href="https://example.com">thespacedoctor</a>'
            )
            pathToEpub = epub.get()

        **WebToMobi**

        To generate a mobi version of the webarticle, just switch *epub* for *mobi*:

        .. code-block:: python

            from polyglot import ebook
            mobi = ebook(
                log=log,
                settings=settings,
                urlOrPath="https://example.com/some-article",
                title="MySQL Sucker",
                bookFormat="mobi",
                outputDirectory="/path/to/output/folder",
                header='<a href="https://example.com">thespacedoctor</a>',
                footer='<a href="https://example.com">thespacedoctor</a>'
            )
            pathToMobi = mobi.get()

        **DocxToEpub**

        To instead convert a DOCX document to epub, simply switch out the URL for the path to the DOCX file, like so:

        .. code-block:: python

            from polyglot import ebook
            epub = ebook(
                log=log,
                settings=settings,
                urlOrPath="/path/to/Volkswagen.docx",
                title="A book about a car",
                bookFormat="epub",
                outputDirectory="/path/to/output/folder",
                header='<a href="https://example.com">thespacedoctor</a>',
                footer='<a href="https://example.com">thespacedoctor</a>'
            )
            pathToEpub = epub.get()

        **DocxToMobi**

        You can work it out yourself by now!
    """
    # Initialisation

    def __init__(
            self,
            log,
            settings,
            urlOrPath,
            outputDirectory,
            bookFormat,
            title=False,
            header=False,
            footer=False
    ):
        self.log = log
        log.debug("instansiating a new 'ebook' object")
        self.settings = settings
        self.title = title
        self.header = header
        self.footer = footer
        self.urlOrPath = urlOrPath
        self.outputDirectory = outputDirectory
        self.format = bookFormat
        # xt-self-arg-tmpx

        # Initial Actions

        return None

    def get(self):
        """
        *get the ebook object*

        **Return:**
            - ``ebook`` -- path to the generated book, or ``None`` if the
              source type or book format is not supported or the conversion
              failed

        **Usage:**

            See class docstring for usage
        """
        self.log.info('starting the ``get`` method')

        if self.format not in ("epub", "mobi"):
            self.log.error(
                "the book format '%s' is not supported (use epub or mobi)" % (self.format,))
            return None

        # BOTH OUTPUT FORMATS START FROM AN EPUB
        source = self.urlOrPath
        if source.startswith(("http", "www.")):
            epub = self._url_to_epub()
        elif ".docx" in source:
            epub = self._docx_to_epub()
        else:
            self.log.error(
                "do not know how to convert '%s' to an ebook" % (source,))
            return None

        if not epub:
            return None

        if self.format == "mobi":
            ebook = self._epub_to_mobi(
                epubPath=epub,
                deleteEpub=False
            )
            # `_epub_to_mobi` RETURNS False ON FAILURE - NOTHING TO TAG
            if not ebook:
                return None
        else:
            ebook = epub

        # `self.url` IS NEVER SET ON THIS CLASS; THE SOURCE LIVES IN
        # `self.urlOrPath`
        tag(
            log=self.log,
            filepath=ebook,
            tags=False,
            rating=False,
            wherefrom=self.urlOrPath
        )

        self.log.info('completed the ``get`` method')
        return ebook

    def _url_to_epub(
            self):
        """*generate the epub book from a URL*

        **Return:**
            - ``epub`` -- path to the generated epub, or ``None`` if the
              webpage could not be cleaned to HTML

        Raises ``IOError`` if pandoc did not produce the epub.
        """
        self.log.info('starting the ``_url_to_epub`` method')

        from polyglot import htmlCleaner
        cleaner = htmlCleaner(
            log=self.log,
            settings=self.settings,
            url=self.urlOrPath,
            outputDirectory=self.outputDirectory,
            title=self.title,  # SET TO FALSE TO USE WEBPAGE TITLE,
            style=False,  # add simpdf's styling to the HTML document
            metadata=True,  # include metadata in generated HTML (e.g. title),
            h1=False  # include title as H1 at the top of the doc
        )
        html = cleaner.clean()
        if not html:
            return None

        header, footer = self._header_footer_files()

        # HTML SOURCE FILE
        epub = os.path.splitext(html)[0] + ".epub"
        try:
            stdout, stderr = self._run(self._pandoc_epub_args(
                html=html, epub=epub, header=header, footer=footer))
        finally:
            # THE TMP HEADER/FOOTER FILES ARE ONLY NEEDED DURING CONVERSION
            self._remove_files(header, footer)

        if not self._file_exists(epub):
            raise IOError(
                "the epub %s does not exist on this machine, here is the failure message: %s" % (epub, stderr))

        os.remove(html)

        self.log.info('completed the ``_url_to_epub`` method')
        return epub

    def _tmp_html_file(
            self,
            content):
        """*create a tmp html file with some content used for the header or footer of the ebook*

        **Key Arguments:**
            - ``content`` -- the content to include in the HTML file.

        **Return:**
            - ``pathToWriteFile`` -- path to the tmp HTML file

        Raises ``IOError`` if the tmp file cannot be opened for writing.
        """
        self.log.info('starting the ``_tmp_html_file`` method')

        content = """

<hr>
<div style="text-align: center">
%(content)s
</div>
<hr>

""" % locals()

        # MICROSECOND TIMESTAMP KEEPS HEADER & FOOTER FILENAMES DISTINCT
        now = datetime.now()
        now = now.strftime("%Y%m%dt%H%M%S%f")
        pathToWriteFile = "/tmp/%(now)s.html" % locals()

        try:
            self.log.debug("attempting to open the file %s" %
                           (pathToWriteFile,))
            writeFile = codecs.open(
                pathToWriteFile, encoding='utf-8', mode='w')
        except IOError as e:
            message = 'could not open the file %s' % (pathToWriteFile,)
            self.log.critical(message)
            raise IOError(message)

        with writeFile:
            writeFile.write(content)

        self.log.info('completed the ``_tmp_html_file`` method')
        return pathToWriteFile

    def _epub_to_mobi(
            self,
            epubPath,
            deleteEpub=False):
        """*convert the give epub to mobi format using kindlegen*

        **Key Arguments:**
            - ``epubPath`` -- path to the epub book
            - ``deleteEpub`` -- delete the epub when mobi is generated. Default *False*

        **Return:**
            - ``mobi`` -- the path to the generated mobi book, or ``False``
              if kindlegen did not produce it
        """
        self.log.info('starting the ``_epub_to_mobi`` method')

        mobi = os.path.splitext(epubPath)[0] + ".mobi"
        stdout, stderr = self._run(
            self._executable_args("kindlegen") + [epubPath])

        if not self._file_exists(mobi):
            # kindlegen REPORTS ITS ERRORS ON stdout; FALL BACK TO stderr
            # WHEN THE EXECUTABLE COULD NOT EVEN BE STARTED
            self.log.error(
                "the mobi %s does not exist on this machine. The kindlegen error was: %s" % (mobi, stdout or stderr))
            return False

        if deleteEpub:
            os.remove(epubPath)

        self.log.info('completed the ``_epub_to_mobi`` method')
        return mobi

    def _docx_to_epub(
            self):
        """*convert docx file to epub*

        **Return:**
            - ``epub`` -- path to the generated epub, or ``None`` if either
              pandoc conversion step (docx to html, html to epub) failed
        """
        self.log.info('starting the ``_docx_to_epub`` method')

        header, footer = self._header_footer_files()

        # FIRST CONVERT THE DOC TO HTML
        docx = self.urlOrPath
        if self.title:
            title = self.title.replace(".html", "")
            html = "/tmp/" + title + ".html"
        else:
            docxName = os.path.basename(docx)
            title = docxName.replace(".docx", "").replace("_", " ")
            html = "/tmp/" + docxName.replace(".docx", ".html")

        # TMP IMAGE DIR
        now = datetime.now()
        now = now.strftime("%Y%m%dt%H%M%S")
        imageDir = "/tmp/%(now)s" % locals()
        if not os.path.exists(imageDir):
            os.makedirs(imageDir)

        try:
            stdout, stderr = self._run(self._executable_args("pandoc") + [
                "--extract-media=" + imageDir,
                "-t", "html", "-f", "docx", docx, "-o", html])

            if not self._file_exists(html):
                self.log.error(
                    "the html %s does not exist on this machine, here is the failure message: %s" % (html, stderr))
                return None

            epubName = os.path.splitext(os.path.basename(html))[0] + ".epub"
            if self.outputDirectory:
                epub = os.path.join(self.outputDirectory, epubName)
            else:
                epub = docx.replace(".docx", ".epub")

            stdout, stderr = self._run(self._pandoc_epub_args(
                html=html, epub=epub, header=header, footer=footer,
                title=title))
        finally:
            # BEST-EFFORT CLEANUP OF EVERY INTERMEDIATE FILE
            shutil.rmtree(imageDir, ignore_errors=True)
            self._remove_files(html, header, footer)

        if not self._file_exists(epub):
            self.log.error(
                "the epub %s does not exist on this machine, here is the failure message: %s" % (epub, stderr))
            return None

        self.log.info('completed the ``_docx_to_epub`` method')
        return epub

    def _header_footer_files(
            self):
        """*write the header/footer content (if any) to tmp html files*

        **Return:**
            - ``header``, ``footer`` -- paths to the tmp files, ``None`` where no content was requested
        """
        header = self._tmp_html_file(self.header) if self.header else None
        footer = self._tmp_html_file(self.footer) if self.footer else None
        return header, footer

    def _executable_args(
            self,
            name):
        """*return the argument list for an executable named in ``settings["executables"]``*

        The setting is split with shell-like rules so a value that carries
        extra arguments keeps working now that commands are no longer run
        through a shell; a leading ``~`` in the executable path is expanded.

        **Key Arguments:**
            - ``name`` -- key in ``settings["executables"]`` (e.g. pandoc, kindlegen)
        """
        import shlex
        args = shlex.split(self.settings["executables"][name])
        if args:
            args[0] = os.path.expanduser(args[0])
        return args

    def _pandoc_epub_args(
            self,
            html,
            epub,
            header=None,
            footer=None,
            title=None):
        """*build the pandoc argument list for the html to epub3 conversion*

        **Key Arguments:**
            - ``html`` -- path to the source HTML file
            - ``epub`` -- path to write the epub to
            - ``header`` -- path to an HTML file to place before the content, or None
            - ``footer`` -- path to an HTML file to place after the content, or None
            - ``title`` -- title to set in the epub metadata, or None
        """
        args = self._executable_args("pandoc")
        if title:
            args.append("--metadata=title:%s" % (title,))
        # NOTE(review): `-S` (smart punctuation) was removed in pandoc 2.0 -
        # confirm the pandoc version this is expected to run against
        args += ["-S", "-s", "-f", "html", "-t", "epub3"]
        # INPUT FILES ARE CONCATENATED BY PANDOC IN THE ORDER GIVEN
        args += [path for path in (header, html, footer) if path]
        args += ["-o", epub]
        return args

    def _run(
            self,
            args):
        """*run an external command without a shell and return its output*

        Arguments are passed as a list so that titles and paths taken from
        webpages or user input cannot be interpreted by a shell.

        **Key Arguments:**
            - ``args`` -- the command and its arguments as a list

        **Return:**
            - ``stdout``, ``stderr`` -- the captured output; if the command
              could not be started ``stdout`` is empty and ``stderr`` holds
              the reason
        """
        try:
            p = Popen(args, stdout=PIPE, stderr=PIPE)
        except OSError as e:
            # e.g. EXECUTABLE NOT FOUND - CALLERS DETECT FAILURE VIA THE
            # MISSING OUTPUT FILE, SO REPORT RATHER THAN RAISE
            return "", str(e)
        stdout, stderr = p.communicate()
        self.log.debug('output: %(stdout)s' % locals())
        return stdout, stderr

    def _remove_files(
            self,
            *paths):
        """*delete the given files, ignoring empty paths and files that cannot be removed*"""
        for path in paths:
            if not path:
                continue
            try:
                os.remove(path)
            except OSError:
                pass

    @staticmethod
    def _file_exists(
            path):
        """*return True if the file at ``path`` can be opened for reading*"""
        try:
            with open(path):
                pass
        except IOError:
            return False
        return True
