#!/usr/bin/env python # -*- coding: utf-8 -*- # vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab from __future__ import unicode_literals, division, absolute_import, print_function import os from .unipath import pathof from loguru import logger import re # note: re requites the pattern to be the exact same type as the data to be searched in python3 # but u"" is not allowed for the pattern itself only b"" ''' NCX (Navigation Control for XML applications) is a generalized navigation definition DTD for application to Digital Talking Books, eBooks, and general web content models. This DTD is an XML application that layers navigation functionality on top of SMIL 2.0 content. The NCX defines a navigation path/model that may be applied upon existing publications, without modification of the existing publication source, so long as the navigation targets within the source publication can be directly referenced via a URI. http://www.daisy.org/z3986/2005/ncx-2005-1.dtd ''' from .mobi_utils import toBase32 from .mobi_index import MobiIndex DEBUG_NCX = True class ncxExtract: def __init__(self, mh): self.mh = mh self.sect = self.mh.sect self.isNCX = False self.mi = MobiIndex(self.sect) self.ncxidx = self.mh.ncxidx self.indx_data = None def parseNCX(self): indx_data = [] tag_fieldname_map = { 1: ["pos", 0], 2: ["len", 0], 3: ["noffs", 0], 4: ["hlvl", 0], 5: ["koffs", 0], 6: ["pos_fid", 0], 21: ["parent", 0], 22: ["child1", 0], 23: ["childn", 0], } if self.ncxidx != 0xFFFFFFFF: outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX") if DEBUG_NCX: logger.debug("ctoc_text {}".format(ctoc_text)) logger.debug("outtbl {}".format(outtbl)) num = 0 for [text, tagMap] in outtbl: tmp = { "name": text.decode("utf-8"), "pos": -1, "len": 0, "noffs": -1, "text": "Unknown Text", "hlvl": -1, "kind": "Unknown Kind", "pos_fid": None, "parent": -1, "child1": -1, "childn": -1, "num": num, } for tag in tag_fieldname_map: [fieldname, i] = tag_fieldname_map[tag] if tag in tagMap: fieldvalue = tagMap[tag][i] if tag == 6: pos_fid = toBase32(fieldvalue, 4).decode("utf-8") fieldvalue2 = tagMap[tag][i + 1] pos_off = toBase32(fieldvalue2, 10).decode("utf-8") fieldvalue = "kindle:pos:fid:%s:off:%s" % (pos_fid, pos_off) tmp[fieldname] = fieldvalue if tag == 3: toctext = ctoc_text.get(fieldvalue, "Unknown Text") toctext = toctext.decode(self.mh.codec) tmp["text"] = toctext if tag == 5: kindtext = ctoc_text.get(fieldvalue, "Unknown Kind") kindtext = kindtext.decode(self.mh.codec) tmp["kind"] = kindtext indx_data.append(tmp) # CGDBG ''' record number: 3 name: 03 position 461377 length: 465358 => position/150 = real page number text: 第二章 青铜时代­——单机游戏 kind: Unknown Kind heading level: 0 => level of section parent: -1 => record number of previous level of section first child: 15 last child: 26 => range of record number of next level section pos_fid is kindle:pos:fid:0023:off:0000000000 ''' if DEBUG_NCX: print("record number: ", num) print( "name: ", tmp["name"], ) print("position", tmp["pos"], " length: ", tmp["len"]) print("text: ", tmp["text"]) print("kind: ", tmp["kind"]) print("heading level: ", tmp["hlvl"]) print("parent:", tmp["parent"]) print( "first child: ", tmp["child1"], " last child: ", tmp["childn"] ) print("pos_fid is ", tmp["pos_fid"]) print("\n\n") num += 1 self.indx_data = indx_data # {'name': '00', 'pos': 167, 'len': 24798, 'noffs': 0, 'text': '版权信息', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 0} # {'name': '0B', 'pos': 67932, 'len': 3274, 'noffs': 236, 'text': '8.希罗多德', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 11} print(indx_data) return indx_data def writeNCX(self, metadata): # build the xml self.isNCX = True logger.debug("Write ncx") # write the ncx file # build the xml xml = self.buildNCX( metadata["Title"][0], metadata["UniqueID"][0], metadata.get("Language")[0], ) # write the ncx file # ncxname = os.path.join(self.files.mobi7dir, self.files.getInputFileBasename() + '.ncx') ncxname = os.path.join(self.files.mobi7dir, "toc.ncx") with open(pathof(ncxname), "wb") as f: f.write(xml.encode("utf-8")) def buildNCX(self): indx_data = self.indx_data # recursive part def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): if start > len(indx_data) or end > len(indx_data): print("Warning: missing INDX child entries", start, end, len(indx_data)) return "" if DEBUG_NCX: logger.debug("recursINDX lvl %d from %d to %d" % (lvl, start, end)) xml = "" if start <= 0: start = 0 if end <= 0: end = len(indx_data) if lvl > max_lvl: max_lvl = lvl indent = " " * (2 + lvl) for i in range(start, end): e = indx_data[i] if not e["hlvl"] == lvl: continue # open entry num += 1 link = "%s#filepos%d" % (htmlfile, e["pos"]) tagid = "np_%d" % num entry = ncx_entry % (tagid, num, e["text"], link) entry = re.sub(re.compile("^", re.M), indent, entry, 0) xml += entry + "\n" # recurs if e["child1"] >= 0: xmlrec, max_lvl, num = recursINDX( max_lvl, num, lvl + 1, e["child1"], e["childn"] + 1 ) xml += xmlrec # close entry xml += indent + "\n" return xml, max_lvl, num body, max_lvl, num = recursINDX() header = ncx_header % (lang, ident, max_lvl + 1, title) ncx = header + body + ncx_footer if not len(indx_data) == num: print("Warning: different number of entries in NCX", len(indx_data), num) return ncx def buildK8NCX(self, indx_data, title, ident, lang): ncx_header = """ %s """ ncx_footer = """ """ ncx_entry = """ %s """ # recursive part def recursINDX(max_lvl=0, num=0, lvl=0, start=-1, end=-1): if start > len(indx_data) or end > len(indx_data): print("Warning: missing INDX child entries", start, end, len(indx_data)) return "" if DEBUG_NCX: logger.debug("recursINDX lvl %d from %d to %d" % (lvl, start, end)) xml = "" if start <= 0: start = 0 if end <= 0: end = len(indx_data) if lvl > max_lvl: max_lvl = lvl indent = " " * (2 + lvl) for i in range(start, end): e = indx_data[i] htmlfile = e["filename"] desttag = e["idtag"] if not e["hlvl"] == lvl: continue # open entry num += 1 if desttag == "": link = "Text/%s" % htmlfile else: link = "Text/%s#%s" % (htmlfile, desttag) tagid = "np_%d" % num entry = ncx_entry % (tagid, num, e["text"], link) entry = re.sub(re.compile("^", re.M), indent, entry, 0) xml += entry + "\n" # recurs if e["child1"] >= 0: xmlrec, max_lvl, num = recursINDX( max_lvl, num, lvl + 1, e["child1"], e["childn"] + 1 ) xml += xmlrec # close entry xml += indent + "\n" return xml, max_lvl, num body, max_lvl, num = recursINDX() header = ncx_header % (lang, ident, max_lvl + 1, title) ncx = header + body + ncx_footer if not len(indx_data) == num: print("Warning: different number of entries in NCX", len(indx_data), num) return ncx def writeK8NCX(self, ncx_data, metadata): # build the xml self.isNCX = True logger.debug("Write K8 ncx") xml = self.buildK8NCX( ncx_data, metadata["Title"][0], metadata["UniqueID"][0], metadata.get("Language")[0], ) ncxname = os.path.join('./', 'k8toc.ncx.json') with open(pathof(ncxname), "wb") as f: f.write(xml.encode("utf-8"))