kindle manager

2024-04-03 15:08:22 +08:00
parent 6b3c0f3b6b
commit 6df3ce42a3
459 changed files with 164651 additions and 4690 deletions
--- a/mobiparse/mobi/mobi_ncx.py
+++ b/mobiparse/mobi/mobi_ncx.py
@@ -0,0 +1,132 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
+
+from __future__ import unicode_literals, division, absolute_import, print_function
+
+import os
+from .unipath import pathof
+from loguru import logger
+
+
+import re
+import json
+
+# note: in python3, re requires the pattern to be exactly the same type as the data being searched,
+# but u"" is not allowed for the pattern itself, only b""
+
+'''
+NCX (Navigation Control for XML applications) is a generalized navigation definition DTD for application
+to Digital Talking Books, eBooks, and general web content models.                                                
+This DTD is an XML application that layers navigation functionality on top of SMIL 2.0  content.                                       
+The NCX defines a navigation path/model that may be applied upon existing publications,
+without modification of the existing publication source, so long as the navigation targets within
+the source publication can be directly referenced via a URI.                      
+
+http://www.daisy.org/z3986/2005/ncx-2005-1.dtd
+'''
+
+from .mobi_utils import toBase32
+from .mobi_index import MobiIndex
+
+DEBUG_NCX = False  # when True, parseNCX logs the raw index tables and prints every parsed entry
+
+class ncxExtract:
+    def __init__(self, mh):
+        self.mh = mh
+        self.sect = self.mh.sect
+        self.isNCX = False
+        self.mi = MobiIndex(self.sect)
+        self.ncxidx = self.mh.ncxidx
+        self.indx_data = None
+
+    def parseNCX(self):
+        indx_data = []
+        tag_fieldname_map = {
+            1: ["pos", 0],
+            2: ["len", 0],
+            3: ["noffs", 0],
+            4: ["hlvl", 0],
+            5: ["koffs", 0],
+            6: ["pos_fid", 0],
+            21: ["parent", 0],
+            22: ["child1", 0],
+            23: ["childn", 0],
+        }
+        if self.ncxidx != 0xFFFFFFFF:
+            outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
+            if DEBUG_NCX:
+                logger.debug("ctoc_text {}".format(ctoc_text))
+                logger.debug("outtbl {}".format(outtbl))
+            num = 0
+            for [text, tagMap] in outtbl:
+                tmp = {
+                    "name": text.decode("utf-8"),
+                    "pos": -1,
+                    "len": 0,
+                    "noffs": -1,
+                    "text": "Unknown Text",
+                    "hlvl": -1,
+                    "kind": "Unknown Kind",
+                    "pos_fid": None,
+                    "parent": -1,
+                    "child1": -1,
+                    "childn": -1,
+                    "num": num,
+                }
+                for tag in tag_fieldname_map:
+                    [fieldname, i] = tag_fieldname_map[tag]
+                    if tag in tagMap:
+                        fieldvalue = tagMap[tag][i]
+                        if tag == 6:
+                            pos_fid = toBase32(fieldvalue, 4).decode("utf-8")
+                            fieldvalue2 = tagMap[tag][i + 1]
+                            pos_off = toBase32(fieldvalue2, 10).decode("utf-8")
+                            fieldvalue = "kindle:pos:fid:%s:off:%s" % (pos_fid, pos_off)
+                        tmp[fieldname] = fieldvalue
+                        if tag == 3:
+                            toctext = ctoc_text.get(fieldvalue, "Unknown Text")
+                            toctext = toctext.decode(self.mh.codec)
+                            tmp["text"] = toctext
+                        if tag == 5:
+                            kindtext = ctoc_text.get(fieldvalue, "Unknown Kind")
+                            kindtext = kindtext.decode(self.mh.codec)
+                            tmp["kind"] = kindtext
+                indx_data.append(tmp)
+
+                # CGDBG
+                '''
+                record number:  3
+                name:  03
+                position 461377  length:  465358  => position/150 = real page number
+                text:  第二章 青铜时代——单机游戏
+                kind:  Unknown Kind
+                heading level:  0 => level of section
+                parent: -1  => record number of previous level of section
+                first child:  15  last child:  26 => range of record number of next level section
+                pos_fid is  kindle:pos:fid:0023:off:0000000000
+                '''
+                if DEBUG_NCX:
+                    print("record number: ", num)
+                    print(
+                        "name: ", tmp["name"],
+                    )
+                    print("position", tmp["pos"], " length: ", tmp["len"])
+                    print("text: ", tmp["text"])
+                    print("kind: ", tmp["kind"])
+                    print("heading level: ", tmp["hlvl"])
+                    print("parent:", tmp["parent"])
+                    print(
+                        "first child: ", tmp["child1"], " last child: ", tmp["childn"]
+                    )
+                    print("pos_fid is ", tmp["pos_fid"])
+                    print("\n\n")
+                num += 1
+        self.indx_data = indx_data
+
+        # {'name': '00', 'pos': 167, 'len': 24798, 'noffs': 0, 'text': '版权信息', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 0}
+        # {'name': '0B', 'pos': 67932, 'len': 3274, 'noffs': 236, 'text': '8.希罗多德', 'hlvl': 0, 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1, 'child1': -1, 'childn': -1, 'num': 11}
+        #print('indx_data {}'.format(json.dumps(indx_data, indent=4, sort_keys=True, ensure_ascii=False)))
+
+        return indx_data
+