155 lines
4.9 KiB
Python
155 lines
4.9 KiB
Python
|
||
|
||
|
||
from collections import defaultdict
|
||
import re
|
||
import json
|
||
|
||
# Matches an optional leading '/' followed by an optional 'img/' prefix,
# so '/img/x', 'img/x' and '/x' all normalize to 'x'.
_IMG_PREFIX_RE = re.compile(r'^/?(?:img/)?')


def _clean_image_path(path):
    """Strip a leading '/img/', 'img/' or bare '/' from an image path."""
    return _IMG_PREFIX_RE.sub('', path, count=1)


def _parse_front_matter(lines):
    """Parse the YAML-ish front matter delimited by '---' lines.

    Supports scalar values (``key: value``) and block lists::

        key:
          - item1
          - item2

    Returns a dict mapping keys to strings or lists of strings.
    """
    metadata = {}
    current_key = None      # key whose block-list items are being collected
    current_list = []
    in_front_matter = False

    for line in lines:
        if line.strip() == '---':
            if in_front_matter:
                break       # closing delimiter: front matter is done
            in_front_matter = True
            continue
        if not in_front_matter:
            continue

        line = line.rstrip('\n')

        # Block-list item ("  - value"); tolerate any indentation width
        # (the original required an exact 4-character '  - ' prefix).
        stripped = line.lstrip()
        if stripped.startswith('- '):
            if current_key:
                current_list.append(stripped[2:].strip())
            continue

        # Key/value pair.  A new key terminates any list being collected.
        if ':' in line:
            if current_key and current_list:
                metadata[current_key] = current_list
                current_list = []

            key, _, value = line.partition(':')
            key = key.strip()
            value = value.strip()

            if value == '':
                # Empty value: expect a block list on the following lines.
                current_key = key
                current_list = []
            else:
                metadata[key] = value
                current_key = None

    # Flush a list that ran to the end of the front matter.
    if current_key and current_list:
        metadata[current_key] = current_list

    return metadata


def _normalize_tag_list(value):
    """Normalize a tags/categories value to a list of stripped strings.

    Accepts the inline form ``[tag1, tag2]`` (quotes tolerated), an
    already-parsed list (returned unchanged), or a plain scalar which
    becomes a one-element list.
    """
    if not isinstance(value, str):
        return value
    if value.startswith('[') and value.endswith(']'):
        parts = value[1:-1].replace('"', '').replace("'", '').split(',')
        return [tag.strip() for tag in parts if tag.strip()]
    return [value]


def _parse_xhs_content(lines):
    """Extract paragraphs and image paths from ``<!--xhs-->`` sections.

    A section starts at a line equal to '<!--xhs-->' and runs until the
    next blank line.  Consecutive non-blank lines are joined with single
    spaces into one paragraph.  Markdown images (``![alt](path)`` and
    ``![[path]]`` syntax) are collected separately, with any leading
    '/', 'img/' or '/img/' prefix removed.

    Returns a ``(paragraphs, images)`` tuple of lists.
    """
    paragraphs = []
    images = []
    in_xhs_section = False
    current_paragraph = []

    for line in lines:
        if line.strip() == '<!--xhs-->':
            in_xhs_section = True
            continue
        if not in_xhs_section:
            continue

        if line.strip() == '':
            # Blank line: close the paragraph and end the section.
            if current_paragraph:
                paragraphs.append(' '.join(current_paragraph))
                current_paragraph = []
            in_xhs_section = False
            continue

        # Collect image paths from both markdown and wiki-link syntax.
        matches = re.findall(r'!\[.*?\]\((.*?)\)', line)
        matches.extend(re.findall(r'!\[\[(.*?)\]\]', line))
        # Fix vs. original: use the same prefix stripping as metadata
        # images, so '/img/x' becomes 'x' (the old r'^(/|img/)' pattern
        # only removed the leading '/').
        images.extend(_clean_image_path(m) for m in matches)

        # Remove image markup; keep any remaining text for the paragraph.
        text = re.sub(r'!\[.*?\]\(.*?\)', '', line).strip()
        text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
        if text:
            current_paragraph.append(text)

    # Flush a paragraph that ran to end-of-file without a blank line.
    if current_paragraph:
        paragraphs.append(' '.join(current_paragraph))

    return paragraphs, images


def parse_markdown_file(file_path):
    """Parse a markdown note into an ``xhsdata`` mapping.

    The result is ``{filename: {...}}`` where the inner dict holds the
    front-matter metadata (tags/xhstags/categories normalized to lists,
    'image' with its '/img/' prefix stripped), plus:

    * ``content`` — list of paragraphs from ``<!--xhs-->`` sections,
    * ``images`` — list of cleaned image paths found in those sections.

    :param file_path: '/'-separated path to the markdown file; the last
        component is used as the result key.
    :raises OSError: if the file cannot be read.
    """
    xhsdata = defaultdict(dict)
    filename = file_path.split('/')[-1]  # NOTE: assumes '/'-separated paths

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    metadata = _parse_front_matter(lines)

    # Tag-like fields always become lists.
    for field in ('tags', 'xhstags', 'categories'):
        if field in metadata:
            xhsdata[filename][field] = _normalize_tag_list(metadata[field])

    # Metadata image path: strip the '/img/' (or 'img/') prefix.
    if 'image' in metadata:
        xhsdata[filename]['image'] = _clean_image_path(metadata['image'])

    # Copy any remaining metadata verbatim (fields above take precedence).
    for key, value in metadata.items():
        if key not in xhsdata[filename]:
            xhsdata[filename][key] = value

    content, images = _parse_xhs_content(lines)
    xhsdata[filename]['content'] = content
    xhsdata[filename]['images'] = images

    return xhsdata
|
||
|
||
# Adjust this path to point at the markdown file you want to parse.
file_path = 'markdown/test.md'

result = parse_markdown_file(file_path)
print(json.dumps(result, ensure_ascii=False, indent=2))
|
||
|
||
|
||
''' 豆包自动生成,提示语:
|
||
用python实现,解析markdown文件,如附件所示,要求:
|
||
1. 解析的内容放到defaultdict数据结构中:
|
||
xhsdata =
|
||
{
|
||
"filename": {
|
||
"title": "Labubu爆火现象",
|
||
"date": "2025-06-19 11:00",
|
||
"tags": ["潮玩","labubu"……],
|
||
"where": "杭州市西湖风景名胜区",
|
||
"open": "yes",
|
||
"content": ["paragraph1","paragraph2","paragraph3"……],
|
||
"images":["image1","image2","image3"……],
|
||
……
|
||
}
|
||
}
|
||
2. 只解析标识<!--xhs-->的段落(直到空行)。
|
||
3. 文字内容按段放到xhsdata的content[]中的元素里。
|
||
4. ![xxx](path/xxx),![yyy](path/yyy),![[path/zzz]]为图片,解析后放到xhsdata的images[]中,如下["path/xxx","path/yyy","path/zzz"]
|
||
|
||
元数据中:
|
||
image路径中去掉/img/或img/;
|
||
tags、xhstags、categories格式为[游玩,生活]或者
|
||
- 游玩
|
||
- 生活
|
||
解析为tags[]和categories[]
|
||
'''
|