Files
xhsautopublisher/backup/parse_markdown_file.ai.py
2025-09-05 17:20:14 +08:00

155 lines
4.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from collections import defaultdict
import re
import json
# Matches either a wiki-style embed ``![[path]]`` (group 1) or an inline
# Markdown image ``![alt](path)`` (group 2). The ``(?!\[)`` lookahead keeps the
# inline alternative from starting on a ``![[`` embed and running on to a later
# ``](``, which would swallow the text in between.
_XHS_IMAGE_RE = re.compile(r'!\[\[(.*?)\]\]|!\[(?!\[).*?\]\((.*?)\)')

# Leading "/", "img/" or "/img/" that is removed from body image paths.
_XHS_IMG_PREFIX_RE = re.compile(r'^/?(?:img/)?')

# Leading "/img/" or "img/" that is removed from the front-matter ``image`` key.
_XHS_COVER_PREFIX_RE = re.compile(r'^(/?img/)')

# Front-matter keys whose value is always normalised to a list.
_XHS_LIST_FIELDS = ('tags', 'xhstags', 'categories')


def _parse_front_matter(lines):
    """Return the ``key: value`` pairs found between the first two ``---`` lines.

    A key with an empty value starts a list; the following ``- item`` lines
    (indented or not) are collected as that key's value. A key that is followed
    by no items is omitted from the result.
    """
    metadata = {}
    current_key = None
    current_list = []
    in_front_matter = False

    for raw in lines:
        stripped = raw.strip()
        if stripped == '---':
            if in_front_matter:
                break
            in_front_matter = True
            continue
        if not in_front_matter:
            continue

        # List item: strip exactly the "- " marker so no character of the
        # value is lost, whatever the indentation of the line is.
        if stripped.startswith('- '):
            if current_key:
                current_list.append(stripped[2:].strip())
            continue

        if ':' in raw:
            # A new key closes the list that was being collected, if any.
            if current_key and current_list:
                metadata[current_key] = current_list
            current_list = []

            key, _, value = raw.partition(':')
            key = key.strip()
            value = value.strip()
            if value == '':
                # Empty value: list items are expected on the next lines.
                current_key = key
            else:
                metadata[key] = value
                current_key = None

    # Flush a list that runs up to the end of the front matter.
    if current_key and current_list:
        metadata[current_key] = current_list
    return metadata


def _parse_xhs_sections(lines):
    """Collect paragraphs and image paths from ``<!--xhs-->`` sections.

    A section starts on the line after a ``<!--xhs-->`` marker and ends at the
    next blank line. Returns ``(content, images)``: one joined string per
    section that contains text, and every image path in document order.
    """
    content = []
    images = []
    in_xhs_section = False
    current_paragraph = []

    for raw in lines:
        stripped = raw.strip()
        if stripped == '<!--xhs-->':
            in_xhs_section = True
            continue
        if not in_xhs_section:
            continue

        if stripped == '':
            # Blank line terminates the section.
            if current_paragraph:
                content.append(' '.join(current_paragraph))
                current_paragraph = []
            in_xhs_section = False
            continue

        for match in _XHS_IMAGE_RE.finditer(raw):
            path = match.group(1) if match.group(1) is not None else match.group(2)
            images.append(_XHS_IMG_PREFIX_RE.sub('', path))

        # Whatever remains after removing the image markup is paragraph text.
        text = _XHS_IMAGE_RE.sub('', raw).strip()
        if text:
            current_paragraph.append(text)

    # The file may end while a section is still open.
    if current_paragraph:
        content.append(' '.join(current_paragraph))
    return content, images


def parse_markdown_file(file_path):
    """Parse a Markdown file into the ``xhsdata`` structure.

    Args:
        file_path: Path of a UTF-8 Markdown file. The part after the last
            ``/`` is used as the key of the result.

    Returns:
        A ``defaultdict(dict)`` with a single entry ``{filename: {...}}``.
        The inner dict holds the front-matter keys plus:

        * ``tags`` / ``xhstags`` / ``categories`` (when present) as lists,
          accepted either as ``[a, b]`` or as ``- a`` list items;
        * ``image`` (when present) with a leading ``/img/`` or ``img/`` removed;
        * ``content``: one string per ``<!--xhs-->`` paragraph;
        * ``images``: image paths from those paragraphs, in document order,
          with a leading ``/``, ``img/`` or ``/img/`` removed.

    Raises:
        OSError: If the file cannot be opened.
    """
    xhsdata = defaultdict(dict)
    filename = file_path.split('/')[-1]

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    metadata = _parse_front_matter(lines)
    entry = xhsdata[filename]

    # Normalise tag-like fields to lists.
    for field in _XHS_LIST_FIELDS:
        if field not in metadata:
            continue
        value = metadata[field]
        if isinstance(value, str):
            if value.startswith('[') and value.endswith(']'):
                # Inline form: [tag1, "tag2"]
                parts = value[1:-1].replace('"', '').split(',')
                value = [part.strip() for part in parts if part.strip()]
            else:
                value = [value]
        entry[field] = value

    # Cover image: drop the leading /img/ or img/ and keep the rest.
    if 'image' in metadata:
        image = metadata['image']
        if isinstance(image, str):
            entry['image'] = _XHS_COVER_PREFIX_RE.sub('', image)
        else:
            # ``image:`` given as a list of items; clean each one.
            entry['image'] = [_XHS_COVER_PREFIX_RE.sub('', item) for item in image]

    # Copy every other front-matter key unchanged.
    for key, value in metadata.items():
        if key not in entry:
            entry[key] = value

    entry['content'], entry['images'] = _parse_xhs_sections(lines)
    return xhsdata
# Run the demo only when this file is executed as a script, so that importing
# the module does not read a file or print anything.
if __name__ == "__main__":
    # Path of the Markdown file to parse; change it as needed.
    file_path = 'markdown/test.md'
    result = parse_markdown_file(file_path)
    # ensure_ascii=False keeps non-ASCII text readable in the printed JSON.
    print(json.dumps(result, indent=2, ensure_ascii=False))
''' 豆包自动生成,提示语:
用python实现解析markdown文件如附件所示要求
1. 解析的内容放到defaultdict数据结构中
xhsdata =
{
"filename": {
"title": "Labubu爆火现象",
"date": "2025-06-19 11:00",
"tags": ["潮玩","labubu"……],
"where": "杭州市西湖风景名胜区",
"open": "yes",
"content": ["paragraph1","paragraph2","paragraph3"……],
"images":["image1","image2","image3"……],
……
}
}
2. 只解析标识<!--xhs-->的段落(直到空行)。
3. 文字内容按段放到xhsdata的content[]中的元素里。
4. ![img](img/path/xxx)![img](/img/path/yyy)![[path/zzz]]为图片解析后放到xhsdata的images[]中,如下["path/xxx","path/yyy","path/zzz"]
元数据中:
image路径中去掉/img/或img/;
tags、xhstags、categories格式为[游玩,生活]或者
- 游玩
- 生活
解析为tags[]和categories[]
'''