#!/usr/bin/env python # -*- coding: utf-8 -*- # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab from __future__ import unicode_literals, division, absolute_import, print_function from .compatibility_utils import PY2, unicode_str from loguru import logger if PY2: range = xrange import struct # note: struct pack, unpack, unpack_from all require bytestring format # data all the way up to at least python 2.7.5, python 3 okay with bytestring import re # note: re requites the pattern to be the exact same type as the data to be searched in python3 # but u"" is not allowed for the pattern itself only b"" _TABLE = [ ("m", 1000), ("cm", 900), ("d", 500), ("cd", 400), ("c", 100), ("xc", 90), ("l", 50), ("xl", 40), ("x", 10), ("ix", 9), ("v", 5), ("iv", 4), ("i", 1), ] def int_to_roman(i): parts = [] num = i for letter, value in _TABLE: while value <= num: num -= value parts.append(letter) return "".join(parts) def roman_to_int(s): result = 0 rnstr = s for letter, value in _TABLE: while rnstr.startswith(letter): result += value rnstr = rnstr[len(letter) :] return result _pattern = r"""\(([^\)]*)\)""" _tup_pattern = re.compile(_pattern, re.IGNORECASE) def _parseNames(numpages, data): data = unicode_str(data) pagenames = [] pageMap = "" for i in range(numpages): pagenames.append(None) for m in re.finditer(_tup_pattern, data): tup = m.group(1) if pageMap != "": pageMap += "," pageMap += "(" + tup + ")" spos, nametype, svalue = tup.split(",") # print(spos, nametype, svalue) if nametype == "a" or nametype == "r": svalue = int(svalue) spos = int(spos) for i in range(spos - 1, numpages): if nametype == "r": pname = int_to_roman(svalue) svalue += 1 elif nametype == "a": pname = "%s" % svalue svalue += 1 elif nametype == "c": sp = svalue.find("|") if sp == -1: pname = svalue else: pname = svalue[0:sp] svalue = svalue[sp + 1 :] else: logger.debug("Error: unknown page numbering type %s" % nametype) pagenames[i] = pname return pagenames, pageMap class PageMapProcessor: def __init__(self, mh, data): self.mh = mh self.data = data self.pagenames = [] self.pageoffsets = [] self.pageMap = "" self.pm_len = 0 self.pm_nn = 0 self.pn_bits = 0 self.pmoff = None self.pmstr = "" logger.debug("Extracting Page Map Information") (rev_len,) = struct.unpack_from(b">L", self.data, 0x10) # skip over header, revision string length data, and revision string ptr = 0x14 + rev_len pm_1, self.pm_len, self.pm_nn, self.pm_bits = struct.unpack_from( b">4H", self.data, ptr ) # print(pm_1, self.pm_len, self.pm_nn, self.pm_bits) self.pmstr = self.data[ptr + 8 : ptr + 8 + self.pm_len] self.pmoff = self.data[ptr + 8 + self.pm_len :] offsize = b">L" offwidth = 4 if self.pm_bits == 16: offsize = b">H" offwidth = 2 ptr = 0 for i in range(self.pm_nn): (od,) = struct.unpack_from(offsize, self.pmoff, ptr) ptr += offwidth self.pageoffsets.append(od) self.pagenames, self.pageMap = _parseNames(self.pm_nn, self.pmstr) def getPageMap(self): return self.pageMap def getNames(self): return self.pagenames def getOffsets(self): return self.pageoffsets # page-map.xml will be unicode but encoded to utf-8 immediately before being written to a file def generateKF8PageMapXML(self, k8proc): pagemapxml = '\n' for i in range(len(self.pagenames)): pos = self.pageoffsets[i] name = self.pagenames[i] if name is not None and name != "": [pn, dir, filename, skelpos, skelend, aidtext] = k8proc.getSkelInfo(pos) idtext = unicode_str(k8proc.getPageIDTag(pos)) linktgt = unicode_str(filename) if idtext != "": linktgt += "#" + idtext pagemapxml += '\n' % (name, dir, linktgt) pagemapxml += "\n" return pagemapxml def generateAPNX(self, apnx_meta): if apnx_meta["format"] == "MOBI_8": content_header = ( '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","format":"%(format)s","fileRevisionId":"1","acr":"%(acr)s"}' % apnx_meta ) else: content_header = ( '{"contentGuid":"%(contentGuid)s","asin":"%(asin)s","cdeType":"%(cdeType)s","fileRevisionId":"1"}' % apnx_meta ) content_header = content_header.encode("utf-8") page_header = '{"asin":"%(asin)s","pageMap":"%(pageMap)s"}' % apnx_meta page_header = page_header.encode("utf-8") apnx = struct.pack(b">H", 1) + struct.pack(b">H", 1) apnx += struct.pack(b">I", 12 + len(content_header)) apnx += struct.pack(b">I", len(content_header)) apnx += content_header apnx += struct.pack(b">H", 1) apnx += struct.pack(b">H", len(page_header)) apnx += struct.pack(b">H", self.pm_nn) apnx += struct.pack(b">H", 32) apnx += page_header for page in self.pageoffsets: apnx += struct.pack(b">L", page) return apnx