"""Parse a Markdown note into metadata, text paragraphs and image paths.

Auto-generated by Doubao (豆包); the original prompt was:

Implement in Python: parse a Markdown file (see attachment), with these
requirements:

1. Put the parsed content into a defaultdict structure:

    xhsdata = {
        "filename": {
            "title": "Labubu爆火现象",
            "date": "2025-06-19 11:00",
            "tags": ["潮玩", "labubu", ...],
            "where": "杭州市西湖风景名胜区",
            "open": "yes",
            "content": ["paragraph1", "paragraph2", "paragraph3", ...],
            "images": ["image1", "image2", "image3", ...],
            ...
        }
    }

2. Only parse the marked paragraphs (each one runs until a blank line).
3. Text content goes into xhsdata's content[], one paragraph per element.
4. ![img](img/path/xxx), ![img](/img/path/yyy) and ![[path/zzz]] are images;
   after parsing they go into xhsdata's images[] like this:
   ["path/xxx", "path/yyy", "path/zzz"]

In the metadata:
- strip /img/ or img/ from the image path;
- tags, xhstags and categories are written either as [游玩,生活] or as
      - 游玩
      - 生活
  and are parsed into tags[] and categories[].
"""

import json
import os
import re
from collections import defaultdict

# Metadata fields that are always normalised to a list of strings.
_LIST_FIELDS = ('tags', 'xhstags', 'categories')

# ![alt](path) and ![[path]] image syntaxes; group 1 is the path.
_MD_IMAGE_RE = re.compile(r'!\[.*?\]\((.*?)\)')
_WIKI_IMAGE_RE = re.compile(r'!\[\[(.*?)\]\]')

# Metadata `image`: strip a leading "img/" or "/img/" only.
_META_IMG_PREFIX_RE = re.compile(r'^/?img/')
# Body images: strip a leading "/", "img/" or "/img/".  The optional parts
# are chained so that "/img/x" loses the whole prefix, not just the slash.
_BODY_IMG_PREFIX_RE = re.compile(r'^/?(?:img/)?')


def _parse_front_matter(lines):
    """Read the ``---`` delimited key/value header.

    Returns ``(metadata, body_start)``.  ``metadata`` maps each key to a
    string, or to a list of strings when the key was followed by ``- item``
    lines.  ``body_start`` is the index of the first line after the closing
    ``---``, or 0 when no closing delimiter was found.
    """
    metadata = {}
    current_key = None      # key whose value is being collected as a list
    current_list = []
    in_front_matter = False
    body_start = 0

    for index, raw_line in enumerate(lines):
        stripped = raw_line.strip()

        if stripped == '---':
            if in_front_matter:
                body_start = index + 1
                break
            in_front_matter = True
            continue
        if not in_front_matter:
            continue

        # List item.  Matching on the stripped line accepts any indentation
        # and keeps the prefix test and the slice offset in agreement.
        if stripped == '-' or stripped.startswith('- '):
            item = stripped[1:].strip()
            if current_key and item:
                current_list.append(item)
            continue

        if ':' in stripped:
            # A new key ends whatever list was being collected.
            if current_key and current_list:
                metadata[current_key] = current_list
                current_list = []

            key, _, value = stripped.partition(':')
            key = key.strip()
            value = value.strip()
            if value == '':
                # Empty value: the following "- item" lines belong to key.
                current_key = key
                current_list = []
            else:
                metadata[key] = value
                current_key = None

    # The header may end while a list is still open.
    if current_key and current_list:
        metadata[current_key] = current_list
    return metadata, body_start


def _as_tag_list(value):
    """Normalise a tag-like metadata value to a list of strings."""
    if not isinstance(value, str):
        return value
    if value.startswith('[') and value.endswith(']'):
        # Inline "[a, "b", 'c']" form: drop quotes and empty entries.
        parts = value[1:-1].replace('"', '').split(',')
        cleaned = (part.strip().strip("'").strip() for part in parts)
        return [tag for tag in cleaned if tag]
    return [value]


def _clean_meta_image(value):
    """Strip the ``img/`` / ``/img/`` prefix from a metadata image value."""
    if isinstance(value, list):
        return [_clean_meta_image(item) for item in value]
    return _META_IMG_PREFIX_RE.sub('', value)


def _parse_body(lines, section_marker):
    """Collect paragraphs and image paths from the body lines.

    Returns ``(paragraphs, images)``.  See ``parse_markdown_file`` for the
    meaning of ``section_marker``.
    """
    marker = section_marker.strip()
    paragraphs = []
    images = []
    current = []            # text lines of the paragraph being built
    in_section = False

    for raw_line in lines:
        stripped = raw_line.strip()

        if stripped == '':
            # A blank line always ends the current paragraph.  With the
            # default empty marker it also opens the next one.
            if current:
                paragraphs.append(' '.join(current))
                current = []
            in_section = marker == ''
            continue
        if stripped == marker:
            in_section = True
            continue
        if not in_section:
            continue

        # Markdown-style images first, then wiki-style, per line.
        found = _MD_IMAGE_RE.findall(raw_line)
        found.extend(_WIKI_IMAGE_RE.findall(raw_line))
        images.extend(_BODY_IMG_PREFIX_RE.sub('', path) for path in found)

        # Whatever is left after removing the image markup is text.
        text = _MD_IMAGE_RE.sub('', raw_line).strip()
        text = _WIKI_IMAGE_RE.sub('', text).strip()
        if text:
            current.append(text)

    # The file may end without a trailing blank line.
    if current:
        paragraphs.append(' '.join(current))
    return paragraphs, images


def parse_markdown_file(file_path, section_marker=''):
    """Parse one Markdown file into ``{filename: {...}}``.

    Args:
        file_path: Path of the Markdown file to read (UTF-8).
        section_marker: Stripped line that opens a section to be parsed; a
            section runs until the next blank line.  With the default ``''``
            a blank line itself opens a section, so every paragraph that
            follows a blank line is collected.
            NOTE(review): the prompt in the module docstring speaks of
            "marked" paragraphs, so a non-empty marker literal was probably
            intended here -- confirm and pass it explicitly.

    Returns:
        A ``defaultdict(dict)`` with a single key, the file's base name. Its
        value holds the front-matter fields (``tags``, ``xhstags`` and
        ``categories`` as lists, ``image`` with its ``img/`` prefix removed),
        plus ``content`` (list of paragraph strings, lines joined by a
        space) and ``images`` (list of image paths with a leading ``/``
        and/or ``img/`` removed).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    filename = os.path.basename(file_path)

    # utf-8-sig also reads plain UTF-8, and drops a BOM that would otherwise
    # hide the opening '---' on the first line.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.readlines()

    metadata, body_start = _parse_front_matter(lines)

    xhsdata = defaultdict(dict)
    entry = xhsdata[filename]

    for field in _LIST_FIELDS:
        if field in metadata:
            entry[field] = _as_tag_list(metadata[field])

    if 'image' in metadata:
        entry['image'] = _clean_meta_image(metadata['image'])

    # Copy every remaining metadata field unchanged.
    for key, value in metadata.items():
        entry.setdefault(key, value)

    # Only lines after the front matter are body text.
    entry['content'], entry['images'] = _parse_body(
        lines[body_start:], section_marker
    )
    return xhsdata


if __name__ == '__main__':
    # Change this path to point at the file to parse.
    file_path = 'markdown/test.md'
    result = parse_markdown_file(file_path)
    print(json.dumps(result, indent=2, ensure_ascii=False))