#!/usr/local/bin/python
# encoding: utf-8
"""
*Convert the HTML export of kindle notebooks (from kindle apps) to markdown*
:Author:
David Young
:Date Created:
October 17, 2016
"""
################# GLOBAL IMPORTS ####################
import sys
import os
import re
import collections
os.environ['TERM'] = 'vt100'
from fundamentals import tools
# THESE ARE THE 4 KINDLE COLORS ARE HOW THEY TRANSLATE TO MD
colorCode = {
"blue": "code",
"yellow": "text",
"orange": "quote",
"pink": "header"
}
[docs]class kindle_notebook():
"""
*convert the HTML export of kindle notebooks (from kindle apps) to markdown*
**Key Arguments:**
- ``log`` -- logger
- ``kindleExportPath`` -- path to the exported kindle HTML file
- ``outputPath`` -- the output path to the md file.
**Usage:**
To convert the exported HTML file of annotation and notes from a kindle book or document to markdown, run the code:
.. code-block:: python
from polyglot.markdown import kindle_notebook
nb = kindle_notebook(
log=log,
kindleExportPath="/path/to/kindle_export.html",
outputPath="/path/to/coverted_annotations.md"
)
nb.convert()
The colours of the annotations convert to markdown attributes via the following key:
.. code-block: json
colorCode = {
"blue": "code",
"yellow": "text",
"orange": "quote",
"pink": "header"
}
"""
# Initialisation
[docs] def __init__(
self,
log,
kindleExportPath,
outputPath
):
self.log = log
log.debug("instansiating a new 'kindle_notebook' object")
self.kindleExportPath = kindleExportPath
self.outputPath = outputPath
# xt-self-arg-tmpx
# Initial Actions
return None
[docs] def convert(self):
"""
*convert the kindle_notebook object*
**Return:**
- ``kindle_notebook``
**Usage:**
.. todo::
- add usage info
- create a sublime snippet for usage
- update the package tutorial if needed
.. code-block:: python
usage code
"""
self.log.info('starting the ``convert`` method')
import codecs
pathToReadFile = self.kindleExportPath
try:
self.log.debug("attempting to open the file %s" %
(pathToReadFile,))
readFile = codecs.open(pathToReadFile, encoding='utf-8', mode='r')
annotations = readFile.read()
readFile.close()
except IOError, e:
message = 'could not open the file %s' % (pathToReadFile,)
self.log.critical(message)
raise IOError(message)
annotations = annotations.replace(u"’", "'").replace(
u"“ ", '"').replace(u"“", '"').replace(u"”", '"').replace(u"–", "-").replace(u"—", "-")
# COLLECT KEY COMPONENTS
try:
title = self.find_component("bookTitle", annotations)
except:
return False
regex = re.compile(r'_xx\d*xx$')
title = regex.sub("", title)
authors = self.find_component("authors", annotations)
citation = self.find_component("citation", annotations)
# CLEAN THE CITATION
regex = re.compile(r'</?i>', re.S)
citation = regex.sub('*', citation)
regex = re.compile(r'Citation \(.*?\): ', re.S)
citation = regex.sub('', citation).replace(" Kindle edition.", "")
# COLLECT ANNOTATIONS
annotationDict = {}
matchObject = re.finditer(
r"""<div class="noteHeading">\s+Highlight\(<span.*?>(?P<color>.*?)</span>\)((?P<section>.*?)Page (?P<page>\d+))?.*?Location (?P<location>\d+)\s+</div>\s+<div class="noteText">(?P<note>.*?)</div>""",
annotations,
flags=re.S
)
for match in matchObject:
location = int(match.group("location"))
location = "%(location)09d" % locals()
if match.group("page"):
try:
annotationDict[location] = {"color": match.group("color"), "page": match.group(
"page"), "section": self.clean(match.group("section"))[3:-2], "note": self.clean(match.group("note"))}
except:
print match.group("note")
sys.exit(0)
else:
try:
annotationDict[location] = {"color": match.group(
"color"), "note": self.clean(match.group("note"))}
except:
print match.group("note")
sys.exit(0)
# COLLECT PERSONAL NOTES
matchObject = re.finditer(
r"""<div class="noteHeading">\s+Note -( Page (?P<page>\d+))?.*?Location (?P<location>\d+)\s+</div>\s+<div class="noteText">(?P<note>.*?)</div>""",
annotations,
flags=re.S
)
for match in matchObject:
location = int(match.group("location"))
location = "%(location)09dnote" % locals()
if match.group("page"):
annotationDict[location] = {"color": None, "page": match.group(
"page"), "note": self.clean(match.group("note"))}
else:
annotationDict[location] = {
"color": None, "note": self.clean(match.group("note"))}
annotationDict = collections.OrderedDict(
sorted(annotationDict.items()))
mdContent = "\n# %(title)s\n\nAuthors: **%(authors)s**\n\n" % locals()
for k, v in annotationDict.iteritems():
mdContent += self.convertToMD(v) + "\n\n"
if len(annotationDict) == 0:
return False
pathToWriteFile = self.outputPath
try:
self.log.debug("attempting to open the file %s" %
(pathToWriteFile,))
writeFile = codecs.open(
pathToWriteFile, encoding='utf-8', mode='w')
except IOError, e:
message = 'could not open the file %s' % (pathToWriteFile,)
self.log.critical(message)
raise IOError(message)
writeFile.write(mdContent)
writeFile.close()
self.log.info('completed the ``convert`` method')
return pathToWriteFile
[docs] def clean(self, text):
return text.strip().replace(u"’", "'").replace(u"“ ", '"').replace(u"“", '"').replace(u"”", '"').replace(u"–", "-").replace(u"—", "-")
[docs] def find_component(self, divtag, annotations):
component = re.search(
r"""<div class="%(divtag)s">(.*?)</div>""" % locals(), annotations, re.S)
return self.clean(component.group(1))
[docs] def convertToMD(self, kindleNote):
if kindleNote["color"] == None:
return "**NOTE**\n: " + kindleNote["note"].replace("\n", " ")
mdType = colorCode[kindleNote["color"]]
if mdType == "code":
return "```\n" + kindleNote["note"] + "\n```"
elif mdType == "text":
return kindleNote["note"]
elif mdType == "header":
regex = re.compile(r'_xx\d*xx$')
kindleNote["note"] = regex.sub("", kindleNote["note"])
return "## " + kindleNote["note"].replace("\n", " ").replace(" ", " ").replace(" ", " ").replace(" ", " ")
elif mdType == "quote":
return "> " + kindleNote["note"].replace("\n", "> ")
# xt-class-method
# 5. @flagged: what actions of the base class(es) need ammending? ammend them here
# Override Method Attributes
# method-override-tmpx