Initial commit

2025-09-05 17:10:11 +08:00
parent 80a1bee0e0
commit 67f33b2d80
138 changed files with 24432 additions and 0 deletions
--- a/backup/parse_markdown_file.py
+++ b/backup/parse_markdown_file.py
@@ -0,0 +1,335 @@
+
+#########################################################
+## @file   : parse_markdown_file.py
+## @desc   : parse hugo markdown file
+## @create : 2025/6/22
+## @author : Chengan，doubao AI
+## @email  : douboer@gmail.com
+#########################################################
+
+import logging
+import json
+import re
+
+from pathlib import Path
+from collections import defaultdict
+
+# Logging configuration
+def setup_logger(log_file=None):
+    """Return the shared ``markdown_parser`` logger, attaching each handler once.
+
+    ``logging.getLogger`` hands back the same logger object on every call, so
+    this function is idempotent: calling it repeatedly (for example once per
+    ``MarkdownParser`` instance) never duplicates handlers or log lines.
+
+    Args:
+        log_file: Optional path of a log file. When given, a UTF-8 file
+            handler at DEBUG level is attached in addition to the console
+            handler (INFO level).
+
+    Returns:
+        The configured ``logging.Logger`` named ``markdown_parser``.
+    """
+    import os  # local import: only needed to normalise the log file path
+
+    logger = logging.getLogger('markdown_parser')
+    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+
+    # FileHandler subclasses StreamHandler, hence the exact type check when
+    # looking for an already attached console handler.
+    if not any(type(handler) is logging.StreamHandler for handler in logger.handlers):
+        console_handler = logging.StreamHandler()
+        console_handler.setLevel(logging.INFO)
+        console_handler.setFormatter(formatter)
+        logger.addHandler(console_handler)
+
+    if log_file:
+        # FileHandler stores its target as an absolute path in baseFilename.
+        target = os.path.abspath(os.fspath(log_file))
+        already_attached = any(
+            isinstance(handler, logging.FileHandler) and handler.baseFilename == target
+            for handler in logger.handlers
+        )
+        if not already_attached:
+            # Messages contain Chinese text; do not depend on the locale encoding.
+            file_handler = logging.FileHandler(log_file, encoding='utf-8')
+            file_handler.setLevel(logging.DEBUG)
+            file_handler.setFormatter(formatter)
+            logger.addHandler(file_handler)
+
+    # The logger level filters records before any handler sees them, so it
+    # must be DEBUG for the file handler to receive DEBUG records. The console
+    # handler still limits console output to INFO and above.
+    has_file_handler = any(isinstance(handler, logging.FileHandler) for handler in logger.handlers)
+    logger.setLevel(logging.DEBUG if has_file_handler else logging.INFO)
+
+    return logger
+
+class MarkdownParser:
+    """Parse a Hugo Markdown file into an ``xhsdata`` mapping and render it as HTML.
+
+    The mapping has the shape ``{filename: {field: value}}``. It holds the
+    front-matter fields plus ``content`` (paragraph and heading strings),
+    ``images`` (image paths) and ``tables`` (lists of rows, each row a list of
+    cell strings) collected from the ``<!--xhs-->`` sections of the file.
+    """
+
+    # Compiled once: these patterns are applied to every line of a section.
+    _MD_IMAGE_RE = re.compile(r'!\[.*?\]\((.*?)\)')   # ![alt](path)
+    _WIKI_IMAGE_RE = re.compile(r'!\[\[(.*?)\]\]')    # ![[path]]
+    _IMG_PREFIX_RE = re.compile(r'^(/?img/)')         # leading "/img/" or "img/"
+    _HEADING_RE = re.compile(r'(#{1,6})(?:\s+(.*))?')  # 1-6 hashes, then text
+    _TABLE_RULE_RE = re.compile(r':?-+:?')            # cell of a |---|:-:| row
+
+    def __init__(self, log_file=None):
+        """Create a parser that logs through ``setup_logger(log_file)``."""
+        self.logger = setup_logger(log_file)
+        self.logger.info("MarkdownParser initialized")
+
+    @staticmethod
+    def _unquote(value):
+        """Return ``value`` without one pair of matching surrounding quotes."""
+        if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
+            return value[1:-1]
+        return value
+
+    def parse_markdown_file(self, file_path):
+        """Parse a Markdown file and extract its metadata and XHS content.
+
+        Args:
+            file_path: Path (``str`` or ``Path``) of the Markdown file.
+
+        Returns:
+            A ``defaultdict(dict)`` with a single entry keyed by the file name.
+
+        Raises:
+            FileNotFoundError: If ``file_path`` does not exist.
+            ValueError: If ``file_path`` is not a regular file.
+        """
+        xhsdata = defaultdict(dict)
+
+        try:
+            # Validate the path before reading.
+            file_path = Path(file_path)
+            if not file_path.exists():
+                self.logger.error(f"文件不存在: {file_path}")
+                raise FileNotFoundError(f"文件不存在: {file_path}")
+
+            if not file_path.is_file():
+                self.logger.error(f"不是有效的文件: {file_path}")
+                raise ValueError(f"不是有效的文件: {file_path}")
+
+            filename = file_path.name
+            self.logger.info(f"开始解析文件: {filename}")
+
+            # utf-8-sig reads plain UTF-8 unchanged and drops a leading BOM,
+            # which would otherwise hide the opening '---' of the front matter.
+            with open(file_path, 'r', encoding='utf-8-sig') as file:
+                lines = file.readlines()
+
+            # Front matter first, then the fields that need normalisation.
+            metadata = self._parse_metadata(lines)
+            self._process_tags_categories(metadata, xhsdata, filename)
+            self._process_image_path(metadata, xhsdata, filename)
+
+            # Copy the remaining metadata without overwriting normalised fields.
+            for key, value in metadata.items():
+                if key not in xhsdata[filename]:
+                    xhsdata[filename][key] = value
+
+            # Body: paragraphs, headings, images and tables of the XHS sections.
+            self._parse_content_images(lines, xhsdata, filename)
+
+            self.logger.info(f"文件解析完成: {filename}")
+            return xhsdata
+
+        except Exception:
+            self.logger.exception(f"解析文件时发生错误: {file_path}")
+            raise
+
+    def _parse_metadata(self, lines):
+        """Parse the YAML-style front matter at the top of the file.
+
+        Only a small subset of YAML is understood: ``key: value`` scalars and
+        ``key:`` followed by ``- item`` lines (any indentation). A key with an
+        empty value and no list items is dropped.
+
+        Args:
+            lines: All lines of the file, with or without trailing newlines.
+
+        Returns:
+            A dict mapping each key to a string or a list of strings. It is
+            empty when the first non-blank line is not ``---``.
+        """
+        metadata = {}
+        current_key = None
+        current_list = []
+        in_front_matter = False
+
+        try:
+            for line in lines:
+                stripped = line.strip()
+
+                if stripped == '---':
+                    if in_front_matter:
+                        break  # closing delimiter
+                    in_front_matter = True
+                    continue
+
+                if not in_front_matter:
+                    if stripped:
+                        # Front matter must open the file. A '---' further
+                        # down is a thematic break, not metadata.
+                        break
+                    continue
+
+                # Skip blank lines and YAML comments.
+                if not stripped or stripped.startswith('#'):
+                    continue
+
+                # List item belonging to the most recent "key:" line.
+                if stripped.startswith('- '):
+                    if current_key:
+                        current_list.append(self._unquote(stripped[2:].strip()))
+                    continue
+
+                if ':' in stripped:
+                    # A new key ends the list collected for the previous key.
+                    if current_key and current_list:
+                        metadata[current_key] = current_list
+                        current_list = []
+
+                    key, _, value = stripped.partition(':')
+                    key = key.strip()
+                    value = value.strip()
+
+                    if value == '':
+                        # Possibly the start of a multi-line list.
+                        current_key = key
+                        current_list = []
+                    else:
+                        metadata[key] = self._unquote(value)
+                        current_key = None
+
+            # Store a list that runs up to the end of the front matter.
+            if current_key and current_list:
+                metadata[current_key] = current_list
+
+            self.logger.debug(f"解析元数据完成: {metadata}")
+            return metadata
+
+        except Exception:
+            self.logger.exception("解析元数据时发生错误")
+            raise
+
+    def _process_tags_categories(self, metadata, xhsdata, filename):
+        """Store ``tags``, ``xhstags`` and ``categories`` as lists of strings.
+
+        A string of the form ``[a, "b", 'c']`` is split on commas with quotes
+        removed; any other string becomes a one-element list; values that are
+        already lists are stored unchanged.
+        """
+        try:
+            for field in ['tags', 'xhstags', 'categories']:
+                if field not in metadata:
+                    continue
+                value = metadata[field]
+                if isinstance(value, str):
+                    if value.startswith('[') and value.endswith(']'):
+                        parts = value[1:-1].replace('"', '').split(',')
+                        cleaned = (part.strip().strip("'").strip() for part in parts)
+                        value = [part for part in cleaned if part]
+                    else:
+                        value = [value]
+                xhsdata[filename][field] = value
+                self.logger.debug(f"处理 {field}: {value}")
+
+        except Exception:
+            self.logger.exception(f"处理标签/分类时发生错误")
+            raise
+
+    def _process_image_path(self, metadata, xhsdata, filename):
+        """Store the ``image`` field without its leading ``/img/`` or ``img/``.
+
+        A non-string ``image`` value (for example a YAML list) is left for the
+        caller to copy through unchanged instead of raising ``TypeError``.
+        """
+        try:
+            image_path = metadata.get('image')
+            if isinstance(image_path, str):
+                clean_image = self._IMG_PREFIX_RE.sub('', image_path)
+                xhsdata[filename]['image'] = clean_image
+                self.logger.debug(f"处理图片路径: {image_path} -> {clean_image}")
+
+        except Exception:
+            self.logger.exception(f"处理图片路径时发生错误")
+            raise
+
+    def _parse_content_images(self, lines, xhsdata, filename):
+        """Collect paragraphs, headings, images and tables from XHS sections.
+
+        A section starts at a ``<!--xhs-->`` line and ends at the next blank
+        line. Inside a section:
+
+        * a line starting with ``#`` is stored on its own as a heading, and
+          the lines after it are ignored until the section ends or a new
+          marker appears;
+        * a line starting with ``|`` is a table row, split into stripped cells;
+        * other lines are joined with spaces into one paragraph after their
+          image markup is removed; image paths go to ``images`` without a
+          leading ``/img/`` or ``img/``.
+
+        The results are written to ``xhsdata[filename]`` under ``content``,
+        ``images`` and ``tables``.
+        """
+        try:
+            entry = xhsdata[filename]
+            content = entry['content'] = []
+            images = entry['images'] = []
+            tables = entry['tables'] = []
+
+            in_xhs_section = False
+            skip_after_heading = False  # True while ignoring lines after a heading
+            current_paragraph = []
+            table_rows = []
+
+            def flush_paragraph(label="添加段落"):
+                # Emit the accumulated paragraph lines as one content item.
+                if current_paragraph:
+                    content.append(' '.join(current_paragraph))
+                    current_paragraph.clear()
+                    self.logger.debug(f"{label}: {content[-1][:30]}...")
+
+            def flush_table():
+                # Emit the accumulated rows as one table.
+                if table_rows:
+                    tables.append(list(table_rows))
+                    table_rows.clear()
+
+            for line in lines:
+                stripped = line.strip()
+
+                if stripped == '<!--xhs-->':
+                    # A new marker closes whatever is open, including a table,
+                    # so tables of adjacent sections are not merged.
+                    flush_paragraph()
+                    flush_table()
+                    in_xhs_section = True
+                    skip_after_heading = False
+                    continue
+
+                if not in_xhs_section:
+                    continue
+
+                if not stripped:
+                    # A blank line ends the section.
+                    flush_paragraph()
+                    flush_table()
+                    in_xhs_section = False
+                    skip_after_heading = False
+                    continue
+
+                if stripped.startswith('#'):
+                    # A heading is a content item of its own.
+                    flush_paragraph()
+                    flush_table()
+                    content.append(stripped)
+                    self.logger.debug(f"添加标题: {stripped[:30]}...")
+                    skip_after_heading = True
+                    continue
+
+                if skip_after_heading:
+                    # Text after a heading needs its own <!--xhs--> marker.
+                    self.logger.debug(f"忽略标题后的内容: {stripped[:30]}...")
+                    continue
+
+                # Image paths, from both ![alt](path) and ![[path]] syntax.
+                matches = self._MD_IMAGE_RE.findall(line) + self._WIKI_IMAGE_RE.findall(line)
+                for match in matches:
+                    clean_match = self._IMG_PREFIX_RE.sub('', match)
+                    images.append(clean_match)
+                    self.logger.debug(f"提取图片: {clean_match}")
+
+                if stripped.startswith('|'):
+                    table_rows.append([cell.strip() for cell in stripped.strip('|').split('|')])
+                    continue
+
+                flush_table()
+
+                # Paragraph text is the line with its image markup removed.
+                text = self._MD_IMAGE_RE.sub('', line).strip()
+                text = self._WIKI_IMAGE_RE.sub('', text).strip()
+                if text:
+                    current_paragraph.append(text)
+
+            # The file may end inside a section.
+            flush_paragraph("添加最后一个段落")
+            flush_table()
+
+        except Exception:
+            self.logger.exception(f"解析内容和图片时发生错误")
+            raise
+
+    def render_to_html(self, xhsdata):
+        """Render the content of ``xhsdata`` as an HTML fragment.
+
+        For every file entry the output holds, in order: ``xhstitle`` as an
+        ``<h1>``, the content items (headings as ``<h1>``-``<h6>``, everything
+        else as ``<p>``), the tables, three ``<br>``, one ``#tag`` line per
+        ``xhstags`` entry, and ``xhssign``. All text is HTML-escaped.
+
+        A content item is a heading only if it starts with one to six ``#``
+        followed by whitespace or the end of the item; ``#tag`` is a paragraph.
+        Markdown table delimiter rows such as ``|---|:-:|`` are not rendered.
+
+        Args:
+            xhsdata: Mapping ``{filename: data}`` as returned by
+                ``parse_markdown_file``.
+
+        Returns:
+            The HTML fragments joined with newlines.
+        """
+        from html import escape  # local import: only this method needs it
+
+        def esc(value):
+            # Element content only, so quotes can stay readable.
+            return escape(str(value), quote=False)
+
+        html_parts = []
+
+        for filename, data in xhsdata.items():
+            # Title
+            if 'xhstitle' in data:
+                html_parts.append(f'<h1 style="font-size: 24px;">{esc(data["xhstitle"])}</h1>')
+
+            # Content
+            for item in data.get('content', []):
+                heading = self._HEADING_RE.fullmatch(item)
+                if heading:
+                    level = len(heading.group(1))
+                    # Drop only the leading hashes; '#' inside the title stays.
+                    title_text = (heading.group(2) or '').strip()
+                    font_size = 24 - (level - 1) * 2
+                    html_parts.append(
+                        f'<h{level} style="font-size: {font_size}px;">{esc(title_text)}</h{level}>'
+                    )
+                else:
+                    html_parts.append(f'<p style="font-size: 16px;">{esc(item)}</p>')
+
+            # Tables
+            for table in data.get('tables', []):
+                html_parts.append('<table border="1">')
+                for row in table:
+                    # Skip the Markdown delimiter row under the header.
+                    if row and all(self._TABLE_RULE_RE.fullmatch(cell) for cell in row):
+                        continue
+                    html_parts.append('<tr>')
+                    for cell in row:
+                        html_parts.append(f'<td style="font-size: 14px;">{esc(cell)}</td>')
+                    html_parts.append('</tr>')
+                html_parts.append('</table>')
+
+            # Three empty lines between the content and the tags.
+            html_parts.extend(['<br>'] * 3)
+
+            # Tags, one per line.
+            if 'xhstags' in data:
+                tags = data['xhstags']
+                if isinstance(tags, str):
+                    # Iterating a bare string would emit one tag per character.
+                    tags = [tags]
+                for tag in tags:
+                    html_parts.append(f'<span style="font-size: 14px;">#{esc(tag)}</span><br>')
+
+            # Signature
+            if 'xhssign' in data:
+                html_parts.append(
+                    f'<br><br><br><span style="font-size: 14px;">{esc(data["xhssign"])}</span><br>'
+                )
+
+        return '\n'.join(html_parts)
+
+# Example usage: parse markdown/test.md, print the result as JSON and write
+# the rendered HTML to test.html.
+if __name__ == "__main__":
+    parser = MarkdownParser(log_file='markdown_parser.log')
+    try:
+        file_path = 'markdown/test.md'
+        result = parser.parse_markdown_file(file_path)
+
+        print(json.dumps(result, indent=2, ensure_ascii=False))
+
+        html_output = parser.render_to_html(result)
+        # Save as GBK. Characters GBK cannot encode (for example emoji) are
+        # written as HTML numeric character references instead of aborting
+        # the whole run with UnicodeEncodeError.
+        with open('test.html', 'w', encoding='gbk', errors='xmlcharrefreplace') as f:
+            f.write(html_output)
+
+    except Exception as e:
+        print(f"程序运行出错: {e}")
+