133 lines
5.4 KiB
Python
Executable File
133 lines
5.4 KiB
Python
Executable File
#!/usr/bin/env python
|
|
# -*- coding: utf-8 -*-
|
|
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
|
|
|
from __future__ import unicode_literals, division, absolute_import, print_function
|
|
|
|
import os
|
|
from .unipath import pathof
|
|
from loguru import logger
|
|
|
|
|
|
import re
|
|
import json
|
|
|
|
# note: re requires the pattern to be the exact same type as the data to be searched in python3
|
|
# but u"" is not allowed for the pattern itself only b""
|
|
|
|
'''
|
|
NCX (Navigation Control for XML applications) is a generalized navigation definition DTD for application
|
|
to Digital Talking Books, eBooks, and general web content models.
|
|
This DTD is an XML application that layers navigation functionality on top of SMIL 2.0 content.
|
|
The NCX defines a navigation path/model that may be applied upon existing publications,
|
|
without modification of the existing publication source, so long as the navigation targets within
|
|
the source publication can be directly referenced via a URI.
|
|
|
|
http://www.daisy.org/z3986/2005/ncx-2005-1.dtd
|
|
'''
|
|
|
|
from .mobi_utils import toBase32
|
|
from .mobi_index import MobiIndex
|
|
|
|
# Set to True to log the raw tables returned by MobiIndex.getIndexData() and to
# print every parsed NCX record while ncxExtract.parseNCX() runs.
DEBUG_NCX = False
|
|
|
|
class ncxExtract:
    """Read the NCX (navigation / table of contents) index of a book.

    The index is located through ``mh.ncxidx`` and decoded with a
    ``MobiIndex`` built from ``mh.sect``.  Call :meth:`parseNCX` to obtain
    one dictionary per navigation entry.
    """

    # Tag number in an entry's tag map -> (output field name, position of the
    # wanted value inside that tag's value list).
    _TAG_FIELDS = {
        1: ("pos", 0),
        2: ("len", 0),
        3: ("noffs", 0),
        4: ("hlvl", 0),
        5: ("koffs", 0),
        6: ("pos_fid", 0),
        21: ("parent", 0),
        22: ("child1", 0),
        23: ("childn", 0),
    }

    def __init__(self, mh):
        """Remember the header object and prepare the index reader.

        Args:
            mh: header object; this class reads its ``sect``, ``ncxidx`` and
                ``codec`` attributes.
        """
        self.mh = mh
        self.sect = self.mh.sect
        self.isNCX = False
        self.mi = MobiIndex(self.sect)
        self.ncxidx = self.mh.ncxidx
        # Filled in by parseNCX(); None until it has been called.
        self.indx_data = None

    def _ctoc_string(self, ctoc_text, offset, placeholder):
        """Return the CTOC string stored at *offset*, or *placeholder*.

        The placeholder is returned as-is when the offset is missing.  The
        previous code called ``.decode()`` on the (text) placeholder, which
        raises ``AttributeError`` on Python 3.
        """
        raw = ctoc_text.get(offset)
        if raw is None:
            return placeholder
        if isinstance(raw, bytes):
            return raw.decode(self.mh.codec)
        # Already text: nothing to decode.
        return raw

    def parseNCX(self):
        """Parse the NCX index into a list of entry dictionaries.

        Returns:
            A list (also stored in ``self.indx_data``) with one dict per index
            entry.  The list is empty when ``self.ncxidx`` is ``0xFFFFFFFF``,
            the value this code treats as "no NCX index".  Each dict holds:

            * ``name``    - entry label decoded as UTF-8
            * ``pos`` / ``len`` - position and length (tags 1 and 2); the
              original author's note says position/150 is the real page number
            * ``noffs`` / ``text`` - CTOC offset of the title (tag 3) and the
              title itself, ``"Unknown Text"`` when it cannot be resolved
            * ``koffs`` / ``kind`` - CTOC offset of the kind (tag 5) and the
              kind itself, ``"Unknown Kind"`` when it cannot be resolved;
              ``koffs`` is only present when tag 5 is
            * ``hlvl``    - heading level, i.e. level of the section (tag 4)
            * ``pos_fid`` - ``"kindle:pos:fid:XXXX:off:YYYYYYYYYY"`` built from
              the two values of tag 6, otherwise ``None``
            * ``parent``  - record number of the enclosing section (tag 21)
            * ``child1`` / ``childn`` - first and last record number of the
              next-level sections (tags 22 and 23)
            * ``num``     - running record number, starting at 0

            Example entry (title translated)::

                {'name': '00', 'pos': 167, 'len': 24798, 'noffs': 0,
                 'text': 'Copyright Information', 'hlvl': 0,
                 'kind': 'Unknown Kind', 'pos_fid': None, 'parent': -1,
                 'child1': -1, 'childn': -1, 'num': 0}
        """
        indx_data = []

        if self.ncxidx != 0xFFFFFFFF:
            outtbl, ctoc_text = self.mi.getIndexData(self.ncxidx, "NCX")
            if DEBUG_NCX:
                logger.debug("ctoc_text {}".format(ctoc_text))
                logger.debug("outtbl {}".format(outtbl))

            for num, (text, tagMap) in enumerate(outtbl):
                # Defaults used for every tag the entry does not carry.
                entry = {
                    "name": text.decode("utf-8"),
                    "pos": -1,
                    "len": 0,
                    "noffs": -1,
                    "text": "Unknown Text",
                    "hlvl": -1,
                    "kind": "Unknown Kind",
                    "pos_fid": None,
                    "parent": -1,
                    "child1": -1,
                    "childn": -1,
                    "num": num,
                }

                for tag, (fieldname, i) in self._TAG_FIELDS.items():
                    if tag not in tagMap:
                        continue
                    fieldvalue = tagMap[tag][i]
                    if tag == 6:
                        # Tag 6 carries two numbers (fid, offset) that are
                        # combined into a single kindle:pos:fid link.
                        pos_fid = toBase32(fieldvalue, 4).decode("utf-8")
                        pos_off = toBase32(tagMap[tag][i + 1], 10).decode("utf-8")
                        fieldvalue = "kindle:pos:fid:%s:off:%s" % (pos_fid, pos_off)
                    entry[fieldname] = fieldvalue
                    if tag == 3:
                        entry["text"] = self._ctoc_string(
                            ctoc_text, fieldvalue, "Unknown Text"
                        )
                    elif tag == 5:
                        entry["kind"] = self._ctoc_string(
                            ctoc_text, fieldvalue, "Unknown Kind"
                        )

                indx_data.append(entry)

                if DEBUG_NCX:
                    print("record number: ", num)
                    print("name: ", entry["name"])
                    print("position", entry["pos"], " length: ", entry["len"])
                    print("text: ", entry["text"])
                    print("kind: ", entry["kind"])
                    print("heading level: ", entry["hlvl"])
                    print("parent:", entry["parent"])
                    print(
                        "first child: ", entry["child1"], " last child: ", entry["childn"]
                    )
                    print("pos_fid is ", entry["pos_fid"])
                    print("\n\n")

        self.indx_data = indx_data
        return indx_data
|
|
|