# Source code for polyglot.markdown.kindle_notebook

#!/usr/local/bin/python
# encoding: utf-8
"""
*Convert the HTML export of kindle notebooks (from kindle apps) to markdown*

:Author:
    David Young

:Date Created:
    October 17, 2016
"""
################# GLOBAL IMPORTS ####################
import sys
import os
import re
import collections
os.environ['TERM'] = 'vt100'
from fundamentals import tools

# THE 4 KINDLE HIGHLIGHT COLORS AND HOW THEY TRANSLATE TO MARKDOWN.
# KEYS ARE THE COLOR NAMES CAPTURED FROM THE EXPORTED HTML; VALUES ARE THE
# MARKDOWN ELEMENT TYPES THAT ``kindle_notebook.convertToMD`` BRANCHES ON.
colorCode = {
    "blue": "code",
    "yellow": "text",
    "orange": "quote",
    "pink": "header"
}


class kindle_notebook():
    """
    *convert the HTML export of kindle notebooks (from kindle apps) to markdown*

    **Key Arguments:**
        - ``log`` -- logger
        - ``kindleExportPath`` -- path to the exported kindle HTML file
        - ``outputPath`` -- the output path to the md file.

    **Usage:**

        To convert the exported HTML file of annotation and notes from a kindle book or document to markdown, run the code:

        .. code-block:: python 

            from polyglot.markdown import kindle_notebook
            nb = kindle_notebook(
                log=log,
                kindleExportPath="/path/to/kindle_export.html",
                outputPath="/path/to/converted_annotations.md"
            )
            nb.convert()

        The colours of the annotations convert to markdown attributes via the following key:

        .. code-block:: python 

            colorCode = {
                "blue": "code",
                "yellow": "text",
                "orange": "quote",
                "pink": "header"
            }
    """

    # (TYPOGRAPHIC CHARACTER, ASCII REPLACEMENT) PAIRS, APPLIED IN THIS ORDER.
    # THE "OPENING QUOTE + SPACE" PAIR MUST COME BEFORE THE BARE OPENING QUOTE
    # SO THAT THE SPACE AFTER AN OPENING QUOTE IS SWALLOWED.
    _PUNCTUATION = (
        (u"\u2019", "'"),   # right single quotation mark
        (u"\u201c ", '"'),  # left double quotation mark followed by a space
        (u"\u201c", '"'),   # left double quotation mark
        (u"\u201d", '"'),   # right double quotation mark
        (u"\u2013", "-"),   # en dash
        (u"\u2014", "-"),   # em dash
    )

    # TRAILING ``_xx<digits>xx`` MARKER REMOVED FROM TITLES AND HEADERS
    _SUFFIX_RE = re.compile(r'_xx\d*xx$')

    # ONE MATCH PER HIGHLIGHT: COLOR, OPTIONAL SECTION + PAGE, LOCATION, TEXT
    _HIGHLIGHT_RE = re.compile(
        r"""<div class="noteHeading">\s+Highlight\(<span.*?>(?P<color>.*?)</span>\)((?P<section>.*?)Page (?P<page>\d+))?.*?Location (?P<location>\d+)\s+</div>\s+<div class="noteText">(?P<note>.*?)</div>""",
        re.S
    )

    # ONE MATCH PER PERSONAL NOTE: OPTIONAL PAGE, LOCATION, TEXT
    _NOTE_RE = re.compile(
        r"""<div class="noteHeading">\s+Note -( Page (?P<page>\d+))?.*?Location (?P<location>\d+)\s+</div>\s+<div class="noteText">(?P<note>.*?)</div>""",
        re.S
    )

    # Initialisation

    def __init__(
            self,
            log,
            kindleExportPath,
            outputPath
    ):
        self.log = log
        log.debug("instansiating a new 'kindle_notebook' object")
        self.kindleExportPath = kindleExportPath
        self.outputPath = outputPath

    def convert(self):
        """
        *convert the kindle HTML export to a markdown file*

        The export is read as UTF-8, the book title and authors are extracted, every highlight and personal note is collected, and the result is written to ``outputPath`` ordered by kindle location (a personal note follows the highlight at the same location).

        **Return:**
            - ``pathToWriteFile`` -- the path of the markdown file written, or ``False`` if the export has no book title or contains no highlights/notes (nothing is written in that case)

        **Raises:**
            - ``IOError`` -- if the export cannot be read or the output file cannot be opened
            - ``AttributeError`` -- if the export has a title but no ``authors`` div
            - ``KeyError`` -- if a highlight uses a colour missing from ``colorCode``

        **Usage:**

            .. code-block:: python 

                mdPath = nb.convert()
                if mdPath is False:
                    print("nothing to convert")
        """
        self.log.info('starting the ``convert`` method')

        import codecs
        pathToReadFile = self.kindleExportPath
        try:
            self.log.debug("attempting to open the file %s" %
                           (pathToReadFile,))
            # THE CONTEXT MANAGER CLOSES THE HANDLE EVEN IF THE READ FAILS
            with codecs.open(pathToReadFile, encoding='utf-8', mode='r') as readFile:
                annotations = readFile.read()
        except IOError:
            message = 'could not open the file %s' % (pathToReadFile,)
            self.log.critical(message)
            raise IOError(message)

        # NORMALISE THE WHOLE DOCUMENT ONCE SO THE REGEXES SEE PLAIN PUNCTUATION
        annotations = self._normalise_punctuation(annotations)

        # COLLECT KEY COMPONENTS - AN EXPORT WITHOUT A TITLE IS NOT CONVERTED
        try:
            title = self.find_component("bookTitle", annotations)
        except AttributeError:
            self.log.warning(
                'no book title found in %s - nothing converted' % (pathToReadFile,))
            return False
        title = self._SUFFIX_RE.sub("", title)
        authors = self.find_component("authors", annotations)

        # COLLECT ANNOTATIONS - KEYED ON THE ZERO-PADDED LOCATION SO THAT A
        # PLAIN STRING SORT GIVES READING ORDER
        annotationDict = {}
        for match in self._HIGHLIGHT_RE.finditer(annotations):
            location = "%09d" % int(match.group("location"))
            entry = {
                "color": match.group("color"),
                "note": self.clean(match.group("note"))
            }
            if match.group("page"):
                entry["page"] = match.group("page")
                # DROPS THE FIRST 3 AND LAST 2 CHARACTERS OF THE CLEANED SECTION
                # TEXT - PRESUMABLY THE SEPARATORS AROUND THE SECTION NAME;
                # VERIFY AGAINST A REAL EXPORT
                entry["section"] = self.clean(match.group("section"))[3:-2]
            annotationDict[location] = entry

        # COLLECT PERSONAL NOTES - THE "note" KEY SUFFIX SORTS A NOTE DIRECTLY
        # AFTER THE HIGHLIGHT AT THE SAME LOCATION
        for match in self._NOTE_RE.finditer(annotations):
            location = "%09dnote" % int(match.group("location"))
            entry = {"color": None, "note": self.clean(match.group("note"))}
            if match.group("page"):
                entry["page"] = match.group("page")
            annotationDict[location] = entry

        if not annotationDict:
            self.log.warning(
                'no highlights or notes found in %s - nothing converted' % (pathToReadFile,))
            return False

        mdContent = "\n# %s\n\nAuthors: **%s**\n\n" % (title, authors)
        mdContent += "".join(
            self.convertToMD(annotationDict[location]) + "\n\n"
            for location in sorted(annotationDict)
        )

        pathToWriteFile = self.outputPath
        try:
            self.log.debug("attempting to open the file %s" %
                           (pathToWriteFile,))
            writeFile = codecs.open(
                pathToWriteFile, encoding='utf-8', mode='w')
        except IOError:
            message = 'could not open the file %s' % (pathToWriteFile,)
            self.log.critical(message)
            raise IOError(message)
        # CLOSE THE HANDLE EVEN IF THE WRITE FAILS
        with writeFile:
            writeFile.write(mdContent)

        self.log.info('completed the ``convert`` method')
        return pathToWriteFile

    def _normalise_punctuation(self, text):
        """
        *replace typographic quotes and dashes in the text with their ASCII equivalents*

        **Key Arguments:**
            - ``text`` -- the text to normalise

        **Return:**
            - ``text`` -- the text with every ``_PUNCTUATION`` pair applied in order
        """
        for fancy, plain in self._PUNCTUATION:
            text = text.replace(fancy, plain)
        return text

    def clean(self, text):
        """
        *strip surrounding whitespace from the text and normalise its punctuation*

        **Key Arguments:**
            - ``text`` -- the text to clean

        **Return:**
            - ``text`` -- the stripped text with typographic quotes/dashes replaced by ASCII
        """
        return self._normalise_punctuation(text.strip())

    def find_component(self, divtag, annotations):
        """
        *return the cleaned content of the first ``<div class="divtag">`` element*

        **Key Arguments:**
            - ``divtag`` -- the class name of the div to find (matched literally)
            - ``annotations`` -- the HTML text to search

        **Return:**
            - ``component`` -- the div content, cleaned with ``clean``

        **Raises:**
            - ``AttributeError`` -- if no such div exists (the exception type is kept so that existing callers catching it keep working)
        """
        component = re.search(
            r"""<div class="%s">(.*?)</div>""" % (re.escape(divtag),), annotations, re.S)
        if component is None:
            raise AttributeError(
                'no <div class="%s"> element found' % (divtag,))

        return self.clean(component.group(1))

    def convertToMD(self, kindleNote):
        """
        *render a single annotation as markdown*

        **Key Arguments:**
            - ``kindleNote`` -- dictionary with a ``note`` (text) and a ``color`` (a key of ``colorCode``, or ``None`` for a personal note). The dictionary is not modified.

        **Return:**
            - ``md`` -- the markdown text for the annotation

        **Raises:**
            - ``KeyError`` -- if the colour is not a key of ``colorCode``
            - ``ValueError`` -- if ``colorCode`` maps the colour to an unknown markdown type
        """
        note = kindleNote["note"]
        if kindleNote["color"] is None:
            return "**NOTE**\n: " + note.replace("\n", " ")

        mdType = colorCode[kindleNote["color"]]
        if mdType == "code":
            return "```\n" + note + "\n```"
        if mdType == "text":
            return note
        if mdType == "header":
            # A HEADER MUST BE A SINGLE LINE WITH SINGLE SPACES
            heading = self._SUFFIX_RE.sub("", note).replace("\n", " ")
            return "## " + re.sub(r" {2,}", " ", heading)
        if mdType == "quote":
            # EVERY LINE OF A MULTI-LINE QUOTE NEEDS ITS OWN "> " PREFIX
            return "> " + note.replace("\n", "\n> ")
        raise ValueError("unknown markdown type %r" % (mdType,))

    # xt-class-method

    # 5. @flagged: what actions of the base class(es) need amending? amend them here
    # Override Method Attributes
    # method-override-tmpx