155 lines
4.9 KiB
Python
155 lines
4.9 KiB
Python
|
||
|
||
|
||
from collections import defaultdict
|
||
import re
|
||
import json
|
||
|
||
# Matches an optional leading '/' followed by an optional 'img/' prefix,
# so '/img/x', 'img/x' and '/x' all normalize to 'x'.
_IMG_PREFIX_RE = re.compile(r'^/?(?:img/)?')


def _clean_image_path(path):
    """Strip a leading '/img/', 'img/' or bare '/' from an image path."""
    return _IMG_PREFIX_RE.sub('', path, count=1)


def _parse_front_matter(lines):
    """Parse the YAML-ish front matter delimited by '---' lines.

    Supports scalar values (``key: value``) and block lists::

        key:
          - item1
          - item2

    Returns a dict mapping keys to strings or lists of strings.
    """
    metadata = {}
    current_key = None      # key whose block-list items are being collected
    current_list = []
    in_front_matter = False

    for line in lines:
        if line.strip() == '---':
            if in_front_matter:
                break       # closing delimiter: front matter is done
            in_front_matter = True
            continue
        if not in_front_matter:
            continue

        line = line.rstrip('\n')

        # Block-list item ("  - value"); tolerate any indentation width
        # (the original required an exact 4-character '  - ' prefix).
        stripped = line.lstrip()
        if stripped.startswith('- '):
            if current_key:
                current_list.append(stripped[2:].strip())
            continue

        # Key/value pair.  A new key terminates any list being collected.
        if ':' in line:
            if current_key and current_list:
                metadata[current_key] = current_list
                current_list = []

            key, _, value = line.partition(':')
            key = key.strip()
            value = value.strip()

            if value == '':
                # Empty value: expect a block list on the following lines.
                current_key = key
                current_list = []
            else:
                metadata[key] = value
                current_key = None

    # Flush a list that ran to the end of the front matter.
    if current_key and current_list:
        metadata[current_key] = current_list

    return metadata


def _normalize_tag_list(value):
    """Normalize a tags/categories value to a list of stripped strings.

    Accepts the inline form ``[tag1, tag2]`` (quotes tolerated), an
    already-parsed list (returned unchanged), or a plain scalar which
    becomes a one-element list.
    """
    if not isinstance(value, str):
        return value
    if value.startswith('[') and value.endswith(']'):
        parts = value[1:-1].replace('"', '').replace("'", '').split(',')
        return [tag.strip() for tag in parts if tag.strip()]
    return [value]


def _parse_xhs_content(lines):
    """Extract paragraphs and image paths from ``<!--xhs-->`` sections.

    A section starts at a line equal to '<!--xhs-->' and runs until the
    next blank line.  Consecutive non-blank lines are joined with single
    spaces into one paragraph.  Markdown images (``![alt](path)`` and
    ``![[path]]`` syntax) are collected separately, with any leading
    '/', 'img/' or '/img/' prefix removed.

    Returns a ``(paragraphs, images)`` tuple of lists.
    """
    paragraphs = []
    images = []
    in_xhs_section = False
    current_paragraph = []

    for line in lines:
        if line.strip() == '<!--xhs-->':
            in_xhs_section = True
            continue
        if not in_xhs_section:
            continue

        if line.strip() == '':
            # Blank line: close the paragraph and end the section.
            if current_paragraph:
                paragraphs.append(' '.join(current_paragraph))
                current_paragraph = []
            in_xhs_section = False
            continue

        # Collect image paths from both markdown and wiki-link syntax.
        matches = re.findall(r'!\[.*?\]\((.*?)\)', line)
        matches.extend(re.findall(r'!\[\[(.*?)\]\]', line))
        # Fix vs. original: use the same prefix stripping as metadata
        # images, so '/img/x' becomes 'x' (the old r'^(/|img/)' pattern
        # only removed the leading '/').
        images.extend(_clean_image_path(m) for m in matches)

        # Remove image markup; keep any remaining text for the paragraph.
        text = re.sub(r'!\[.*?\]\(.*?\)', '', line).strip()
        text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
        if text:
            current_paragraph.append(text)

    # Flush a paragraph that ran to end-of-file without a blank line.
    if current_paragraph:
        paragraphs.append(' '.join(current_paragraph))

    return paragraphs, images


def parse_markdown_file(file_path):
    """Parse a markdown note into an ``xhsdata`` mapping.

    The result is ``{filename: {...}}`` where the inner dict holds the
    front-matter metadata (tags/xhstags/categories normalized to lists,
    'image' with its '/img/' prefix stripped), plus:

    * ``content`` — list of paragraphs from ``<!--xhs-->`` sections,
    * ``images`` — list of cleaned image paths found in those sections.

    :param file_path: '/'-separated path to the markdown file; the last
        component is used as the result key.
    :raises OSError: if the file cannot be read.
    """
    xhsdata = defaultdict(dict)
    filename = file_path.split('/')[-1]  # NOTE: assumes '/'-separated paths

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    metadata = _parse_front_matter(lines)

    # Tag-like fields always become lists.
    for field in ('tags', 'xhstags', 'categories'):
        if field in metadata:
            xhsdata[filename][field] = _normalize_tag_list(metadata[field])

    # Metadata image path: strip the '/img/' (or 'img/') prefix.
    if 'image' in metadata:
        xhsdata[filename]['image'] = _clean_image_path(metadata['image'])

    # Copy any remaining metadata verbatim (fields above take precedence).
    for key, value in metadata.items():
        if key not in xhsdata[filename]:
            xhsdata[filename][key] = value

    content, images = _parse_xhs_content(lines)
    xhsdata[filename]['content'] = content
    xhsdata[filename]['images'] = images

    return xhsdata
|
||
|
||
# Adjust this path to point at the markdown file you want to parse.
file_path = 'markdown/test.md'

result = parse_markdown_file(file_path)
print(json.dumps(result, ensure_ascii=False, indent=2))
|
||
|
||
|
||
''' 豆包自动生成,提示语:
|
||
用python实现,解析markdown文件,如附件所示,要求:
|
||
1. 解析的内容放到defaultdict数据结构中:
|
||
xhsdata =
|
||
{
|
||
"filename": {
|
||
"title": "Labubu爆火现象",
|
||
"date": "2025-06-19 11:00",
|
||
"tags": ["潮玩","labubu"……],
|
||
"where": "杭州市西湖风景名胜区",
|
||
"open": "yes",
|
||
"content": ["paragraph1","paragraph2","paragraph3"……],
|
||
"images":["image1","image2","image3"……],
|
||
……
|
||
}
|
||
}
|
||
2. 只解析标识<!--xhs-->的段落(直到空行)。
|
||
3. 文字内容按段放到xhsdata的content[]中的元素里。
|
||
4. ![xxx](path/xxx),![yyy](path/yyy),![[path/zzz]]为图片,解析后放到xhsdata的images[]中,如下["path/xxx","path/yyy","path/zzz"]
|
||
|
||
元数据中:
|
||
image路径中去掉/img/或img/;
|
||
tags、xhstags、categories格式为[游玩,生活]或者
|
||
- 游玩
|
||
- 生活
|
||
解析为tags[]和categories[]
|
||
'''
|