Initial commit

2025-09-05 17:10:11 +08:00
parent 80a1bee0e0
commit 67f33b2d80
138 changed files with 24432 additions and 0 deletions
--- a/backup/parse_markdown_file.py.bk2
+++ b/backup/parse_markdown_file.py.bk2
@@ -0,0 +1,141 @@
+#########################################################
+## @file   : parse_markdown_file.py
+## @desc   : parse hugo markdown file
+## @create : 2025/6/22
+## @author : Chengan，doubao AI
+## @email  : douboer@gmail.com
+#########################################################
+
+from collections import defaultdict
+import re
+import json
+
+def _parse_front_matter(lines):
+    """Parse the front matter found between the first two '---' lines.
+
+    Only a flat subset of YAML is understood: ``key: value`` scalars and
+    ``key:`` followed by ``- item`` block-list entries.
+
+    Args:
+        lines: the file content as a list of lines.
+
+    Returns:
+        A dict mapping each key to a string (scalar) or a list of strings
+        (block list). Keys with an empty value and no list items are omitted.
+    """
+    metadata = {}
+    current_key = None
+    current_list = []
+    in_front_matter = False
+
+    for raw_line in lines:
+        if raw_line.strip() == '---':
+            if in_front_matter:
+                break  # closing delimiter: front matter is complete
+            in_front_matter = True
+            continue
+        if not in_front_matter:
+            continue
+
+        line = raw_line.rstrip('\n')
+        stripped = line.lstrip()
+
+        # Block-list item. Accepted at any indentation, because front matter
+        # is commonly written both as "  - item" and as "- item".
+        if stripped.startswith('- '):
+            if current_key:
+                current_list.append(stripped[2:].strip())
+            continue
+
+        if ':' in line:
+            # A new key ends the list that was being collected.
+            if current_key and current_list:
+                metadata[current_key] = current_list
+                current_list = []
+
+            key, _, value = line.partition(':')
+            key = key.strip()
+            value = value.strip()
+
+            if value == '':
+                # Empty value: the following "- item" lines belong to this key.
+                current_key = key
+                current_list = []
+            else:
+                metadata[key] = value
+                current_key = None
+
+    # Flush a list that runs up to the end of the front matter.
+    if current_key and current_list:
+        metadata[current_key] = current_list
+
+    return metadata
+
+
+def _extract_xhs_sections(lines):
+    """Collect text paragraphs and image paths from '<!--xhs-->' sections.
+
+    A section starts at a line equal to '<!--xhs-->' and ends at the next
+    blank line. The non-blank lines of a section are joined with single
+    spaces into one paragraph, with image markup removed.
+
+    Args:
+        lines: the file content as a list of lines.
+
+    Returns:
+        A ``(paragraphs, images)`` tuple of two lists of strings.
+    """
+    paragraphs = []
+    images = []
+    in_xhs_section = False
+    current_paragraph = []
+
+    for line in lines:
+        stripped = line.strip()
+
+        if stripped == '<!--xhs-->':
+            # A new marker closes the paragraph in progress, if any.
+            if current_paragraph and in_xhs_section:
+                paragraphs.append(' '.join(current_paragraph))
+                current_paragraph = []
+            in_xhs_section = True
+            continue
+
+        if not in_xhs_section:
+            continue
+
+        if stripped == '':
+            # A blank line terminates the section.
+            if current_paragraph:
+                paragraphs.append(' '.join(current_paragraph))
+                current_paragraph = []
+            in_xhs_section = False
+            continue
+
+        # Both Markdown images ![alt](path) and wiki-style ![[path]] count.
+        matches = re.findall(r'!\[.*?\]\((.*?)\)', line)
+        matches.extend(re.findall(r'!\[\[(.*?)\]\]', line))
+        for match in matches:
+            # Strip a leading "/", "img/" or "/img/" so that "/img/a.png"
+            # and "img/a.png" both normalise to "a.png".
+            images.append(re.sub(r'^/?(?:img/)?', '', match))
+
+        # Keep only the text that remains once image markup is removed.
+        text = re.sub(r'!\[.*?\]\(.*?\)', '', line).strip()
+        text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
+        if text:
+            current_paragraph.append(text)
+
+    # Flush a paragraph that runs up to the end of the file.
+    if current_paragraph:
+        paragraphs.append(' '.join(current_paragraph))
+
+    return paragraphs, images
+
+
+def parse_markdown_file(file_path):
+    """Parse a Hugo markdown file into front-matter metadata and xhs content.
+
+    Args:
+        file_path: path of the markdown file to read.
+
+    Returns:
+        A ``defaultdict(dict)`` with a single entry keyed by the file's base
+        name. That entry holds the front-matter keys, where 'tags',
+        'xhstags' and 'categories' are normalised to lists and 'image' has
+        its leading '/img/' or 'img/' removed. It also holds 'content'
+        (list of paragraphs) and 'images' (list of image paths), both taken
+        from the '<!--xhs-->' sections.
+
+    Raises:
+        OSError: if the file cannot be opened.
+    """
+    import os  # local import: the module header does not import os
+
+    xhsdata = defaultdict(dict)
+    # basename() also handles platform-specific separators.
+    filename = os.path.basename(file_path)
+
+    # utf-8-sig drops a leading BOM, which would otherwise keep the opening
+    # '---' line from being recognised.
+    with open(file_path, 'r', encoding='utf-8-sig') as file:
+        lines = file.readlines()
+
+    metadata = _parse_front_matter(lines)
+    entry = xhsdata[filename]
+
+    # Normalise tag-like fields to lists.
+    for field in ('tags', 'xhstags', 'categories'):
+        if field in metadata:
+            value = metadata[field]
+            if isinstance(value, str):
+                if value.startswith('[') and value.endswith(']'):
+                    # Inline list such as ["tag1", tag2].
+                    value = value[1:-1].replace('"', '').split(',')
+                    value = [tag.strip() for tag in value if tag.strip()]
+                else:
+                    value = [value]
+            entry[field] = value
+
+    # Remove a leading "/img/" or "img/" from the cover image path. A
+    # non-string value is copied unchanged by the loop below.
+    image_path = metadata.get('image')
+    if isinstance(image_path, str):
+        entry['image'] = re.sub(r'^(/?img/)', '', image_path)
+
+    # Copy the remaining metadata without overwriting normalised fields.
+    for key, value in metadata.items():
+        if key not in entry:
+            entry[key] = value
+
+    entry['content'], entry['images'] = _extract_xhs_sections(lines)
+
+    return xhsdata
+
+# Demo entry point: parse the sample file and print the result as JSON.
+# Guarded so that importing this module neither reads a file nor prints;
+# change file_path to point at the markdown file to parse.
+if __name__ == '__main__':
+    file_path = 'markdown/test.md'
+    result = parse_markdown_file(file_path)
+    print(json.dumps(result, indent=2, ensure_ascii=False))
+
+