kindle manager
This commit is contained in:
575
mobimaster/mobi/mobi_k8proc.py
Executable file
575
mobimaster/mobi/mobi_k8proc.py
Executable file
@@ -0,0 +1,575 @@
|
||||
#!/usr/bin/env python
|
||||
# -*- coding: utf-8 -*-
|
||||
# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
|
||||
|
||||
from __future__ import unicode_literals, division, absolute_import, print_function
|
||||
|
||||
from .compatibility_utils import PY2, bstr, utf8_str
|
||||
from loguru import logger
|
||||
|
||||
if PY2:
|
||||
range = xrange
|
||||
|
||||
import os
|
||||
|
||||
import struct
|
||||
|
||||
# note: struct pack, unpack, unpack_from all require bytestring format
|
||||
# data all the way up to at least python 2.7.5, python 3 okay with bytestring
|
||||
|
||||
import re
|
||||
|
||||
# note: re requites the pattern to be the exact same type as the data to be searched in python3
|
||||
# but u"" is not allowed for the pattern itself only b""
|
||||
|
||||
from .mobi_index import MobiIndex
|
||||
from .mobi_utils import fromBase32
|
||||
from .unipath import pathof
|
||||
|
||||
_guide_types = [
|
||||
b"cover",
|
||||
b"title-page",
|
||||
b"toc",
|
||||
b"index",
|
||||
b"glossary",
|
||||
b"acknowledgements",
|
||||
b"bibliography",
|
||||
b"colophon",
|
||||
b"copyright-page",
|
||||
b"dedication",
|
||||
b"epigraph",
|
||||
b"foreward",
|
||||
b"loi",
|
||||
b"lot",
|
||||
b"notes",
|
||||
b"preface",
|
||||
b"text",
|
||||
]
|
||||
|
||||
# locate beginning and ending positions of tag with specific aid attribute
|
||||
def locate_beg_end_of_tag(ml, aid):
|
||||
pattern = utf8_str(r"""<[^>]*\said\s*=\s*['"]%s['"][^>]*>""" % aid)
|
||||
aid_pattern = re.compile(pattern, re.IGNORECASE)
|
||||
for m in re.finditer(aid_pattern, ml):
|
||||
plt = m.start()
|
||||
pgt = ml.find(b">", plt + 1)
|
||||
return plt, pgt
|
||||
return 0, 0
|
||||
|
||||
|
||||
# iterate over all tags in block in reverse order, i.e. last ta to first tag
|
||||
def reverse_tag_iter(block):
|
||||
end = len(block)
|
||||
while True:
|
||||
pgt = block.rfind(b">", 0, end)
|
||||
if pgt == -1:
|
||||
break
|
||||
plt = block.rfind(b"<", 0, pgt)
|
||||
if plt == -1:
|
||||
break
|
||||
yield block[plt : pgt + 1]
|
||||
end = plt
|
||||
|
||||
|
||||
class K8Processor:
|
||||
def __init__(self, mh, sect, files, debug=False):
|
||||
self.sect = sect
|
||||
self.files = files
|
||||
self.mi = MobiIndex(sect)
|
||||
self.mh = mh
|
||||
self.skelidx = mh.skelidx
|
||||
self.fragidx = mh.fragidx
|
||||
self.guideidx = mh.guideidx
|
||||
self.fdst = mh.fdst
|
||||
self.flowmap = {}
|
||||
self.flows = None
|
||||
self.flowinfo = []
|
||||
self.parts = None
|
||||
self.partinfo = []
|
||||
self.linked_aids = set()
|
||||
self.fdsttbl = [0, 0xFFFFFFFF]
|
||||
self.DEBUG = debug
|
||||
|
||||
# read in and parse the FDST info which is very similar in format to the Palm DB section
|
||||
# parsing except it provides offsets into rawML file and not the Palm DB file
|
||||
# this is needed to split up the final css, svg, etc flow section
|
||||
# that can exist at the end of the rawML file
|
||||
if self.fdst != 0xFFFFFFFF:
|
||||
header = self.sect.loadSection(self.fdst)
|
||||
if header[0:4] == b"FDST":
|
||||
(num_sections,) = struct.unpack_from(b">L", header, 0x08)
|
||||
self.fdsttbl = struct.unpack_from(
|
||||
bstr(">%dL" % (num_sections * 2)), header, 12
|
||||
)[::2] + (mh.rawSize,)
|
||||
sect.setsectiondescription(self.fdst, "KF8 FDST INDX")
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFDST Section Map: %d sections" % num_sections)
|
||||
for j in range(num_sections):
|
||||
logger.debug(
|
||||
"Section %d: 0x%08X - 0x%08X"
|
||||
% (j, self.fdsttbl[j], self.fdsttbl[j + 1])
|
||||
)
|
||||
else:
|
||||
logger.debug("\nError: K8 Mobi with Missing FDST info")
|
||||
|
||||
# read/process skeleton index info to create the skeleton table
|
||||
skeltbl = []
|
||||
if self.skelidx != 0xFFFFFFFF:
|
||||
# for i in range(2):
|
||||
# fname = 'skel%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.skelidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.skelidx, "KF8 Skeleton")
|
||||
fileptr = 0
|
||||
for [text, tagMap] in outtbl:
|
||||
# file number, skeleton name, fragtbl record count, start position, length
|
||||
skeltbl.append(
|
||||
[fileptr, text, tagMap[1][0], tagMap[6][0], tagMap[6][1]]
|
||||
)
|
||||
fileptr += 1
|
||||
self.skeltbl = skeltbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nSkel Table: %d entries" % len(self.skeltbl))
|
||||
logger.debug(
|
||||
"table: filenum, skeleton name, frag tbl record count, start position, length"
|
||||
)
|
||||
for j in range(len(self.skeltbl)):
|
||||
logger.debug(self.skeltbl[j])
|
||||
|
||||
# read/process the fragment index to create the fragment table
|
||||
fragtbl = []
|
||||
if self.fragidx != 0xFFFFFFFF:
|
||||
# for i in range(3):
|
||||
# fname = 'frag%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.fragidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(self.fragidx, "KF8 Fragment")
|
||||
for [text, tagMap] in outtbl:
|
||||
# insert position, ctoc offset (aidtext), file number, sequence number, start position, length
|
||||
ctocoffset = tagMap[2][0]
|
||||
ctocdata = ctoc_text[ctocoffset]
|
||||
fragtbl.append(
|
||||
[
|
||||
int(text),
|
||||
ctocdata,
|
||||
tagMap[3][0],
|
||||
tagMap[4][0],
|
||||
tagMap[6][0],
|
||||
tagMap[6][1],
|
||||
]
|
||||
)
|
||||
self.fragtbl = fragtbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFragment Table: %d entries" % len(self.fragtbl))
|
||||
logger.debug(
|
||||
"table: file position, link id text, file num, sequence number, start position, length"
|
||||
)
|
||||
for j in range(len(self.fragtbl)):
|
||||
logger.debug(self.fragtbl[j])
|
||||
|
||||
# read / process guide index for guide elements of opf
|
||||
guidetbl = []
|
||||
if self.guideidx != 0xFFFFFFFF:
|
||||
# for i in range(3):
|
||||
# fname = 'guide%04d.dat' % i
|
||||
# data = self.sect.loadSection(self.guideidx + i)
|
||||
# with open(pathof(fname), 'wb') as f:
|
||||
# f.write(data)
|
||||
outtbl, ctoc_text = self.mi.getIndexData(
|
||||
self.guideidx, "KF8 Guide elements)"
|
||||
)
|
||||
for [text, tagMap] in outtbl:
|
||||
# ref_type, ref_title, frag number
|
||||
ctocoffset = tagMap[1][0]
|
||||
ref_title = ctoc_text[ctocoffset]
|
||||
ref_type = text
|
||||
fileno = None
|
||||
if 3 in tagMap:
|
||||
fileno = tagMap[3][0]
|
||||
if 6 in tagMap:
|
||||
fileno = tagMap[6][0]
|
||||
guidetbl.append([ref_type, ref_title, fileno])
|
||||
self.guidetbl = guidetbl
|
||||
if self.DEBUG:
|
||||
logger.debug("\nGuide Table: %d entries" % len(self.guidetbl))
|
||||
logger.debug("table: ref_type, ref_title, fragtbl entry number")
|
||||
for j in range(len(self.guidetbl)):
|
||||
logger.debug(self.guidetbl[j])
|
||||
|
||||
def buildParts(self, rawML):
|
||||
# now split the rawML into its flow pieces
|
||||
self.flows = []
|
||||
for j in range(0, len(self.fdsttbl) - 1):
|
||||
start = self.fdsttbl[j]
|
||||
end = self.fdsttbl[j + 1]
|
||||
self.flows.append(rawML[start:end])
|
||||
|
||||
# the first piece represents the xhtml text
|
||||
text = self.flows[0]
|
||||
self.flows[0] = b""
|
||||
|
||||
# walk the <skeleton> and fragment tables to build original source xhtml files
|
||||
# *without* destroying any file position information needed for later href processing
|
||||
# and create final list of file separation start: stop points and etc in partinfo
|
||||
if self.DEBUG:
|
||||
logger.debug("\nRebuilding flow piece 0: the main body of the ebook")
|
||||
self.parts = []
|
||||
self.partinfo = []
|
||||
fragptr = 0
|
||||
baseptr = 0
|
||||
cnt = 0
|
||||
filename = "part%04d.xhtml" % cnt
|
||||
for [skelnum, skelname, fragcnt, skelpos, skellen] in self.skeltbl:
|
||||
baseptr = skelpos + skellen
|
||||
skeleton = text[skelpos:baseptr]
|
||||
aidtext = "0"
|
||||
for i in range(fragcnt):
|
||||
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[
|
||||
fragptr
|
||||
]
|
||||
aidtext = idtext[12:-2]
|
||||
if i == 0:
|
||||
filename = "part%04d.xhtml" % filenum
|
||||
slice = text[baseptr : baseptr + length]
|
||||
insertpos = insertpos - skelpos
|
||||
head = skeleton[:insertpos]
|
||||
tail = skeleton[insertpos:]
|
||||
actual_inspos = insertpos
|
||||
if tail.find(b">") < tail.find(b"<") or head.rfind(b">") < head.rfind(
|
||||
b"<"
|
||||
):
|
||||
# There is an incomplete tag in either the head or tail.
|
||||
# This can happen for some badly formed KF8 files
|
||||
logger.debug(
|
||||
"The fragment table for %s has incorrect insert position. Calculating manually."
|
||||
% skelname
|
||||
)
|
||||
bp, ep = locate_beg_end_of_tag(skeleton, aidtext)
|
||||
if bp != ep:
|
||||
actual_inspos = ep + 1 + startpos
|
||||
if insertpos != actual_inspos:
|
||||
print(
|
||||
"fixed corrupt fragment table insert position",
|
||||
insertpos + skelpos,
|
||||
actual_inspos + skelpos,
|
||||
)
|
||||
insertpos = actual_inspos
|
||||
self.fragtbl[fragptr][0] = actual_inspos + skelpos
|
||||
skeleton = skeleton[0:insertpos] + slice + skeleton[insertpos:]
|
||||
baseptr = baseptr + length
|
||||
fragptr += 1
|
||||
cnt += 1
|
||||
self.parts.append(skeleton)
|
||||
self.partinfo.append([skelnum, "Text", filename, skelpos, baseptr, aidtext])
|
||||
|
||||
assembled_text = b"".join(self.parts)
|
||||
if self.DEBUG:
|
||||
outassembled = os.path.join(self.files.k8dir, "assembled_text.dat")
|
||||
with open(pathof(outassembled), "wb") as f:
|
||||
f.write(assembled_text)
|
||||
|
||||
# The primary css style sheet is typically stored next followed by any
|
||||
# snippets of code that were previously inlined in the
|
||||
# original xhtml but have been stripped out and placed here.
|
||||
# This can include local CDATA snippets and and svg sections.
|
||||
|
||||
# The problem is that for most browsers and ereaders, you can not
|
||||
# use <img src="imageXXXX.svg" /> to import any svg image that itself
|
||||
# properly uses an <image/> tag to import some raster image - it
|
||||
# should work according to the spec but does not for almost all browsers
|
||||
# and ereaders and causes epub validation issues because those raster
|
||||
# images are in manifest but not in xhtml text - since they only
|
||||
# referenced from an svg image
|
||||
|
||||
# So we need to check the remaining flow pieces to see if they are css
|
||||
# or svg images. if svg images, we must check if they have an <image />
|
||||
# and if so inline them into the xhtml text pieces.
|
||||
|
||||
# there may be other sorts of pieces stored here but until we see one
|
||||
# in the wild to reverse engineer we won't be able to tell
|
||||
self.flowinfo.append([None, None, None, None])
|
||||
svg_tag_pattern = re.compile(br"""(<svg[^>]*>)""", re.IGNORECASE)
|
||||
image_tag_pattern = re.compile(br"""(<image[^>]*>)""", re.IGNORECASE)
|
||||
for j in range(1, len(self.flows)):
|
||||
flowpart = self.flows[j]
|
||||
nstr = "%04d" % j
|
||||
m = re.search(svg_tag_pattern, flowpart)
|
||||
if m is not None:
|
||||
# svg
|
||||
ptype = b"svg"
|
||||
start = m.start()
|
||||
m2 = re.search(image_tag_pattern, flowpart)
|
||||
if m2 is not None:
|
||||
pformat = b"inline"
|
||||
pdir = None
|
||||
fname = None
|
||||
# strip off anything before <svg if inlining
|
||||
flowpart = flowpart[start:]
|
||||
else:
|
||||
pformat = b"file"
|
||||
pdir = "Images"
|
||||
fname = "svgimg" + nstr + ".svg"
|
||||
else:
|
||||
# search for CDATA and if exists inline it
|
||||
if flowpart.find(b"[CDATA[") >= 0:
|
||||
ptype = b"css"
|
||||
flowpart = b'<style type="text/css">\n' + flowpart + b"\n</style>\n"
|
||||
pformat = b"inline"
|
||||
pdir = None
|
||||
fname = None
|
||||
else:
|
||||
# css - assume as standalone css file
|
||||
ptype = b"css"
|
||||
pformat = b"file"
|
||||
pdir = "Styles"
|
||||
fname = "style" + nstr + ".css"
|
||||
|
||||
self.flows[j] = flowpart
|
||||
self.flowinfo.append([ptype, pformat, pdir, fname])
|
||||
|
||||
if self.DEBUG:
|
||||
logger.debug("\nFlow Map: %d entries" % len(self.flowinfo))
|
||||
for fi in self.flowinfo:
|
||||
logger.debug(fi)
|
||||
logger.debug("\n")
|
||||
|
||||
logger.debug(
|
||||
"\nXHTML File Part Position Information: %d entries"
|
||||
% len(self.partinfo)
|
||||
)
|
||||
for pi in self.partinfo:
|
||||
logger.debug(pi)
|
||||
|
||||
if False: # self.Debug:
|
||||
# dump all of the locations of the aid tags used in TEXT
|
||||
# find id links only inside of tags
|
||||
# inside any < > pair find all "aid=' and return whatever is inside the quotes
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
logger.debug("\npositions of all aid= pieces")
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\said\s*=\s*['"]([^'"]*)['"][^>]*>""", re.IGNORECASE
|
||||
)
|
||||
for m in re.finditer(id_pattern, rawML):
|
||||
[filename, partnum, start, end] = self.getFileInfo(m.start())
|
||||
[seqnum, idtext] = self.getFragTblInfo(m.start())
|
||||
value = fromBase32(m.group(1))
|
||||
logger.debug(
|
||||
" aid: %s value: %d at: %d -> part: %d, start: %d, end: %d"
|
||||
% (m.group(1), value, m.start(), partnum, start, end)
|
||||
)
|
||||
logger.debug(" %s fragtbl entry %d" % (idtext, seqnum))
|
||||
|
||||
return
|
||||
|
||||
# get information fragment table entry by pos
|
||||
def getFragTblInfo(self, pos):
|
||||
for j in range(len(self.fragtbl)):
|
||||
[insertpos, idtext, filenum, seqnum, startpos, length] = self.fragtbl[j]
|
||||
if pos >= insertpos and pos < (insertpos + length):
|
||||
# why are these "in: and before: added here
|
||||
return seqnum, b"in: " + idtext
|
||||
if pos < insertpos:
|
||||
return seqnum, b"before: " + idtext
|
||||
return None, None
|
||||
|
||||
# get information about the part (file) that exists at pos in original rawML
|
||||
def getFileInfo(self, pos):
|
||||
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||
if pos >= start and pos < end:
|
||||
return filename, partnum, start, end
|
||||
return None, None, None, None
|
||||
|
||||
# accessor functions to properly protect the internal structure
|
||||
def getNumberOfParts(self):
|
||||
return len(self.parts)
|
||||
|
||||
def getPart(self, i):
|
||||
if i >= 0 and i < len(self.parts):
|
||||
return self.parts[i]
|
||||
return None
|
||||
|
||||
def getPartInfo(self, i):
|
||||
if i >= 0 and i < len(self.partinfo):
|
||||
return self.partinfo[i]
|
||||
return None
|
||||
|
||||
def getNumberOfFlows(self):
|
||||
return len(self.flows)
|
||||
|
||||
def getFlow(self, i):
|
||||
# note flows[0] is empty - it was all of the original text
|
||||
if i > 0 and i < len(self.flows):
|
||||
return self.flows[i]
|
||||
return None
|
||||
|
||||
def getFlowInfo(self, i):
|
||||
# note flowinfo[0] is empty - it was all of the original text
|
||||
if i > 0 and i < len(self.flowinfo):
|
||||
return self.flowinfo[i]
|
||||
return None
|
||||
|
||||
def getIDTagByPosFid(self, posfid, offset):
|
||||
# first convert kindle:pos:fid and offset info to position in file
|
||||
# (fromBase32 can handle both string types on input)
|
||||
row = fromBase32(posfid)
|
||||
off = fromBase32(offset)
|
||||
[insertpos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[row]
|
||||
pos = insertpos + off
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if fname is None:
|
||||
# pos does not exist
|
||||
# default to skeleton pos instead
|
||||
print(
|
||||
"Link To Position", pos, "does not exist, retargeting to top of target"
|
||||
)
|
||||
pos = self.skeltbl[filenum][3]
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
# an existing "id=" or "name=" attribute must exist in original xhtml otherwise it would not have worked for linking.
|
||||
# Amazon seems to have added its own additional "aid=" inside tags whose contents seem to represent
|
||||
# some position information encoded into Base32 name.
|
||||
# so find the closest "id=" before position the file by actually searching in that file
|
||||
idtext = self.getIDTag(pos)
|
||||
return fname, idtext
|
||||
|
||||
def getIDTag(self, pos):
|
||||
# find the first tag with a named anchor (name or id attribute) before pos
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if pn is None and skelpos is None:
|
||||
logger.debug("Error: getIDTag - no file contains %s" % pos)
|
||||
textblock = self.parts[pn]
|
||||
npos = pos - skelpos
|
||||
# if npos inside a tag then search all text before the its end of tag marker
|
||||
pgt = textblock.find(b">", npos)
|
||||
plt = textblock.find(b"<", npos)
|
||||
if plt == npos or pgt < plt:
|
||||
npos = pgt + 1
|
||||
# find id and name attributes only inside of tags
|
||||
# use a reverse tag search since that is faster
|
||||
# inside any < > pair find "id=" and "name=" attributes return it
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
textblock = textblock[0:npos]
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
name_pattern = re.compile(
|
||||
br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
aid_pattern = re.compile(br"""<[^>]+\s(?:aid|AID)\s*=\s*['"]([^'"]+)['"]""")
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
# any ids in the body should default to top of file
|
||||
if tag[0:6] == b"<body ":
|
||||
return b""
|
||||
if tag[0:6] != b"<meta ":
|
||||
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
m = aid_pattern.match(tag)
|
||||
if m is not None:
|
||||
self.linked_aids.add(m.group(1))
|
||||
return b"aid-" + m.group(1)
|
||||
return b""
|
||||
|
||||
# do we need to do deep copying
|
||||
def setParts(self, parts):
|
||||
assert len(parts) == len(self.parts)
|
||||
for i in range(len(parts)):
|
||||
self.parts[i] = parts[i]
|
||||
|
||||
# do we need to do deep copying
|
||||
def setFlows(self, flows):
|
||||
assert len(flows) == len(self.flows)
|
||||
for i in range(len(flows)):
|
||||
self.flows[i] = flows[i]
|
||||
|
||||
# get information about the part (file) that exists at pos in original rawML
|
||||
def getSkelInfo(self, pos):
|
||||
for [partnum, pdir, filename, start, end, aidtext] in self.partinfo:
|
||||
if pos >= start and pos < end:
|
||||
return [partnum, pdir, filename, start, end, aidtext]
|
||||
return [None, None, None, None, None, None]
|
||||
|
||||
# fileno is actually a reference into fragtbl (a fragment)
|
||||
def getGuideText(self):
|
||||
guidetext = b""
|
||||
for [ref_type, ref_title, fileno] in self.guidetbl:
|
||||
if ref_type == b"thumbimagestandard":
|
||||
continue
|
||||
if ref_type not in _guide_types and not ref_type.startswith(b"other."):
|
||||
if ref_type == b"start":
|
||||
ref_type = b"text"
|
||||
else:
|
||||
ref_type = b"other." + ref_type
|
||||
[pos, idtext, filenum, seqnm, startpos, length] = self.fragtbl[fileno]
|
||||
[pn, pdir, filename, skelpos, skelend, aidtext] = self.getSkelInfo(pos)
|
||||
idtext = self.getIDTag(pos)
|
||||
linktgt = filename.encode("utf-8")
|
||||
if idtext != b"":
|
||||
linktgt += b"#" + idtext
|
||||
guidetext += (
|
||||
b'<reference type="'
|
||||
+ ref_type
|
||||
+ b'" title="'
|
||||
+ ref_title
|
||||
+ b'" href="'
|
||||
+ utf8_str(pdir)
|
||||
+ b"/"
|
||||
+ linktgt
|
||||
+ b'" />\n'
|
||||
)
|
||||
# opf is encoded utf-8 so must convert any titles properly
|
||||
guidetext = (guidetext.decode(self.mh.codec)).encode("utf-8")
|
||||
return guidetext
|
||||
|
||||
def getPageIDTag(self, pos):
|
||||
# find the first tag with a named anchor (name or id attribute) before pos
|
||||
# but page map offsets need to little more leeway so if the offset points
|
||||
# into a tag look for the next ending tag "/>" or "</" and start your search from there.
|
||||
fname, pn, skelpos, skelend = self.getFileInfo(pos)
|
||||
if pn is None and skelpos is None:
|
||||
logger.debug("Error: getIDTag - no file contains %s" % pos)
|
||||
textblock = self.parts[pn]
|
||||
npos = pos - skelpos
|
||||
# if npos inside a tag then search all text before next ending tag
|
||||
pgt = textblock.find(b">", npos)
|
||||
plt = textblock.find(b"<", npos)
|
||||
if plt == npos or pgt < plt:
|
||||
# we are in a tag
|
||||
# so find first ending tag
|
||||
pend1 = textblock.find(b"/>", npos)
|
||||
pend2 = textblock.find(b"</", npos)
|
||||
if pend1 != -1 and pend2 != -1:
|
||||
pend = min(pend1, pend2)
|
||||
else:
|
||||
pend = max(pend1, pend2)
|
||||
if pend != -1:
|
||||
npos = pend
|
||||
else:
|
||||
npos = pgt + 1
|
||||
# find id and name attributes only inside of tags
|
||||
# use a reverse tag search since that is faster
|
||||
# inside any < > pair find "id=" and "name=" attributes return it
|
||||
# [^>]* means match any amount of chars except for '>' char
|
||||
# [^'"] match any amount of chars except for the quote character
|
||||
# \s* means match any amount of whitespace
|
||||
textblock = textblock[0:npos]
|
||||
id_pattern = re.compile(
|
||||
br"""<[^>]*\sid\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
name_pattern = re.compile(
|
||||
br"""<[^>]*\sname\s*=\s*['"]([^'"]*)['"]""", re.IGNORECASE
|
||||
)
|
||||
for tag in reverse_tag_iter(textblock):
|
||||
# any ids in the body should default to top of file
|
||||
if tag[0:6] == b"<body ":
|
||||
return b""
|
||||
if tag[0:6] != b"<meta ":
|
||||
m = id_pattern.match(tag) or name_pattern.match(tag)
|
||||
if m is not None:
|
||||
return m.group(1)
|
||||
return b""
|
||||
Reference in New Issue
Block a user