Initial commit

This commit is contained in:
douboer
2025-09-05 17:10:11 +08:00
parent 80a1bee0e0
commit 67f33b2d80
138 changed files with 24432 additions and 0 deletions

60
backup/config.py Normal file
View File

@@ -0,0 +1,60 @@
# config.py
# Font files used for each text role on the rendered image.
fonts_path = {
    'title': 'fonts/LXGWWenKai-Medium.TTF',
    'subtitle': 'fonts/LXGWWenKai-Medium.TTF',
    'body': 'fonts/LXGWWenKai-Light.TTF',
    'signature': 'fonts/LXGWWenKai-Light.TTF'
}

# Background templates: source image plus inner padding in pixels.
templates = {
    'minimal': {
        'background': 'backgrounds/IMG_5784.JPG',
        'padding': 50
    },
    'modern': {
        'background': 'backgrounds/IMG_5789.JPG',
        'padding': 60
    },
    'vintage': {
        'background': 'backgrounds/IMG_5793.JPG',
        'padding': 70
    }
}

# Per-section text styles: sizes in px, colors as RGB tuples, spacings in px.
styles = {
    'title': {
        'size': 48,
        'letter_spacing': 0,
        'color': (30, 30, 30),
        'line_spacing': 10,
        'top_spacing': 0,
        'bottom_spacing': 20
    },
    'subtitle': {
        'size': 32,
        'letter_spacing': 0,
        'color': (60, 60, 60),
        'line_spacing': 10,
        'top_spacing': 20,
        'bottom_spacing': 20
    },
    'body': {
        'size': 24,
        'letter_spacing': 0,
        'color': (20, 20, 20),
        'line_spacing': 15
    },
    'signature': {
        'size': 28,
        'letter_spacing': 2,
        'color': (80, 80, 80),
        'position': 'right',            # horizontal: left, center, right
        'vertical_position': 'bottom',  # vertical: bottom, flow
        'offset': 30                    # offset from the bottom edge
    },
    # BUGFIX: txt_to_image.py reads styles['top_offset'],
    # styles['bottom_offset'] and styles['paragraph_spacing'] directly;
    # these keys were missing and raised KeyError at render time.
    'top_offset': 0,          # extra offset from the top of the canvas
    'bottom_offset': 0,       # extra offset from the bottom of the canvas
    'paragraph_spacing': 20   # vertical gap between body paragraphs
}

31
backup/logger_utils.py Normal file
View File

@@ -0,0 +1,31 @@
import logging
# 公共的日志类
class CommonLogger:
    """Shared logger wrapper: console output at INFO, optional file output.

    BUGFIX: ``logging.getLogger(__name__)`` returns a process-wide singleton,
    so the original implementation attached a fresh handler set on every
    construction, duplicating each log line.  Handlers are now attached only
    once.
    """

    def __init__(self, log_file=None):
        """Configure the module logger.

        Args:
            log_file: optional path; when given, DEBUG-and-above records are
                also written to this file.
        """
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        # Only configure handlers on first construction (see class docstring).
        if not self.logger.handlers:
            formatter = logging.Formatter(
                '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)
            console_handler.setFormatter(formatter)
            self.logger.addHandler(console_handler)
            if log_file:
                # NOTE: the handler accepts DEBUG, but the logger level is
                # INFO, so DEBUG records are still filtered out upstream.
                file_handler = logging.FileHandler(log_file)
                file_handler.setLevel(logging.DEBUG)
                file_handler.setFormatter(formatter)
                self.logger.addHandler(file_handler)

    def get_logger(self):
        """Return the configured ``logging.Logger`` instance."""
        return self.logger

View File

@@ -0,0 +1,154 @@
from collections import defaultdict
import re
import json
def parse_markdown_file(file_path):
    """Parse a Hugo markdown file into ``{filename: {metadata..., content, images}}``.

    Only paragraphs introduced by a line that is exactly ``<!--xhs-->`` are
    collected (up to the next blank line).  Image links of the forms
    ``![alt](path)`` and ``![[path]]`` are extracted into ``images`` with any
    leading ``/img/`` or ``img/`` prefix removed.

    Args:
        file_path: path to the markdown file (POSIX separators assumed).

    Returns:
        defaultdict mapping the file's basename to the parsed fields.
    """
    xhsdata = defaultdict(dict)
    filename = file_path.split('/')[-1]
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # --- YAML front matter (between the first pair of '---' lines) ---
    metadata = {}
    current_key = None
    current_list = []
    in_front_matter = False
    for line in lines:
        if line.strip() == '---':
            if in_front_matter:
                break
            in_front_matter = True
            continue
        if not in_front_matter:
            continue
        line = line.rstrip('\n')
        # YAML list item ("  - value") belonging to the pending key.
        # (Prefix reconstructed from a whitespace-mangled paste; line[4:]
        # implies a 4-character "  - " marker.)
        if line.startswith('  - '):
            if current_key:
                current_list.append(line[4:].strip())
            continue
        if ':' in line:
            # A new key closes any list still being accumulated.
            if current_key and current_list:
                metadata[current_key] = current_list
                current_list = []
            key, *value_parts = line.split(':', 1)
            key = key.strip()
            value = value_parts[0].strip() if value_parts else ''
            if value == '':
                # Bare "key:" opens a multi-line (list) value.
                current_key = key
                current_list = []
            else:
                metadata[key] = value
                current_key = None
    # Flush a list that ran to the end of the front matter.
    if current_key and current_list:
        metadata[current_key] = current_list

    # Normalise tag-like fields: "[a,b]" or an already-parsed list -> list.
    for field in ['tags', 'xhstags', 'categories']:
        if field in metadata:
            value = metadata[field]
            if isinstance(value, str):
                if value.startswith('[') and value.endswith(']'):
                    value = value[1:-1].replace('"', '').split(',')
                    value = [tag.strip() for tag in value if tag.strip()]
                else:
                    value = [value]
            xhsdata[filename][field] = value

    # Cover image: strip a leading '/img/' or 'img/'.
    if 'image' in metadata:
        image_path = metadata['image']
        clean_image = re.sub(r'^(/?img/)', '', image_path)
        xhsdata[filename]['image'] = clean_image

    # Copy the remaining metadata verbatim.
    for key, value in metadata.items():
        if key not in xhsdata[filename]:
            xhsdata[filename][key] = value

    # --- body: only <!--xhs--> flagged paragraphs ---
    xhsdata[filename]['content'] = []
    xhsdata[filename]['images'] = []
    in_xhs_section = False
    current_paragraph = []
    for line in lines:
        if line.strip() == '<!--xhs-->':
            in_xhs_section = True
            continue
        if in_xhs_section:
            if line.strip() == '':
                # Blank line ends the flagged paragraph.
                if current_paragraph:
                    xhsdata[filename]['content'].append(' '.join(current_paragraph))
                    current_paragraph = []
                in_xhs_section = False
            else:
                # Image links: ![alt](path) and ![[path]].
                image_matches = re.findall(r'!\[.*?\]\((.*?)\)', line)
                image_matches.extend(re.findall(r'!\[\[(.*?)\]\]', line))
                for match in image_matches:
                    # BUGFIX: was r'^(/|img/)' which rewrote '/img/x' to
                    # 'img/x'; strip the whole optional-slash img/ prefix,
                    # consistent with the other parser variants.
                    clean_match = re.sub(r'^(/?img/)', '', match)
                    xhsdata[filename]['images'].append(clean_match)
                # Keep the text with image markup removed.
                text = re.sub(r'!\[.*?\]\(.*?\)', '', line).strip()
                text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
                if text:
                    current_paragraph.append(text)
    # Flush a paragraph that ran to the end of the file.
    if current_paragraph:
        xhsdata[filename]['content'].append(' '.join(current_paragraph))
    return xhsdata
if __name__ == "__main__":
    # Example usage; adjust the path to the markdown file to parse.
    # BUGFIX: this ran unconditionally at import time (and crashed when the
    # sample file was absent); it is now guarded behind __main__.
    file_path = 'markdown/test.md'
    result = parse_markdown_file(file_path)
    print(json.dumps(result, indent=2, ensure_ascii=False))

# NOTE: This module was AI-generated.  Original prompt (translated from
# Chinese): parse a markdown file into a defaultdict keyed by filename with
# fields such as title/date/tags/content/images; only paragraphs flagged by
# a <!--xhs--> marker (up to the next blank line) are collected, one content[]
# entry per paragraph; ![img](img/x), ![img](/img/x) and ![[x]] links go into
# images[] with the leading /img/ or img/ prefix stripped; tags, xhstags and
# categories accept both the inline "[a,b]" form and the YAML "- item" form.

View File

@@ -0,0 +1,335 @@
#########################################################
## @file : parse_markdown_file.py
## @desc : parse hugo markdown file
## @create : 2025/6/22
## @author : Chengandoubao AI
## @email : douboer@gmail.com
#########################################################
import logging
import json
import re
from pathlib import Path
from collections import defaultdict
# 配置日志
def setup_logger(log_file=None):
    """Create (or return the already-configured) 'markdown_parser' logger.

    BUGFIX: ``logging.getLogger('markdown_parser')`` is a process-wide
    singleton; the original attached new handlers on every call, so each
    ``MarkdownParser`` instance multiplied the log output.  Handlers are now
    attached only once.

    Args:
        log_file: optional path; when given, DEBUG-and-above records are also
            written to this file (only honoured on the first call).

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger('markdown_parser')
    logger.setLevel(logging.INFO)
    # Already configured by an earlier call: do not stack duplicate handlers.
    if logger.handlers:
        return logger
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
class MarkdownParser:
    """Parse a Hugo markdown file and render the extracted XHS data to HTML.

    ``parse_markdown_file`` produces a ``defaultdict`` shaped as::

        {filename: {metadata..., 'content': [...], 'images': [...], 'tables': [...]}}

    Only paragraphs introduced by a line that is exactly ``<!--xhs-->`` are
    collected (up to the next blank line).
    """

    def __init__(self, log_file=None):
        """Set up logging; *log_file* additionally writes records to a file."""
        self.logger = setup_logger(log_file)
        self.logger.info("MarkdownParser initialized")

    def parse_markdown_file(self, file_path):
        """Parse *file_path*, returning metadata plus XHS content/images/tables.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the path is not a regular file.
        """
        xhsdata = defaultdict(dict)
        try:
            # Validate the path before reading.
            file_path = Path(file_path)
            if not file_path.exists():
                self.logger.error(f"文件不存在: {file_path}")
                raise FileNotFoundError(f"文件不存在: {file_path}")
            if not file_path.is_file():
                self.logger.error(f"不是有效的文件: {file_path}")
                raise ValueError(f"不是有效的文件: {file_path}")
            filename = file_path.name
            # BUGFIX: the message had degraded to a literal "(unknown)";
            # restore the filename placeholder.
            self.logger.info(f"开始解析文件: {filename}")
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
            # YAML front matter.
            metadata = self._parse_metadata(lines)
            # Normalise list-valued fields into xhsdata.
            self._process_tags_categories(metadata, xhsdata, filename)
            # Strip the img/ prefix from the cover image.
            self._process_image_path(metadata, xhsdata, filename)
            # Copy remaining metadata verbatim.
            for key, value in metadata.items():
                if key not in xhsdata[filename]:
                    xhsdata[filename][key] = value
            # Body text, inline images and markdown tables.
            self._parse_content_images(lines, xhsdata, filename)
            self.logger.info(f"文件解析完成: {filename}")
            return xhsdata
        except Exception:
            self.logger.exception(f"解析文件时发生错误: {file_path}")
            raise

    def _parse_metadata(self, lines):
        """Parse the YAML front matter (between the first '---' pair)."""
        metadata = {}
        current_key = None
        current_list = []
        in_front_matter = False
        try:
            for line in lines:
                if line.strip() == '---':
                    if in_front_matter:
                        break
                    in_front_matter = True
                    continue
                if not in_front_matter:
                    continue
                line = line.rstrip('\n')
                # YAML list item belonging to the pending key; the "  - "
                # prefix is reconstructed from a whitespace-mangled paste
                # (line[4:] implies a 4-character marker).
                if line.startswith('  - '):
                    if current_key:
                        current_list.append(line[4:].strip())
                    continue
                if ':' in line:
                    # A new key closes any list still being accumulated.
                    if current_key and current_list:
                        metadata[current_key] = current_list
                        current_list = []
                    key, *value_parts = line.split(':', 1)
                    key = key.strip()
                    value = value_parts[0].strip() if value_parts else ''
                    if value == '':
                        # Bare "key:" opens a multi-line (list) value.
                        current_key = key
                        current_list = []
                    else:
                        metadata[key] = value
                        current_key = None
            # Flush a list that ran to the end of the front matter.
            if current_key and current_list:
                metadata[current_key] = current_list
            self.logger.debug(f"解析元数据完成: {metadata}")
            return metadata
        except Exception:
            self.logger.exception("解析元数据时发生错误")
            raise

    def _process_tags_categories(self, metadata, xhsdata, filename):
        """Normalise tags/xhstags/categories into lists of strings."""
        try:
            for field in ['tags', 'xhstags', 'categories']:
                if field in metadata:
                    value = metadata[field]
                    if isinstance(value, str):
                        # Inline "[tag1,tag2]" form.
                        if value.startswith('[') and value.endswith(']'):
                            value = value[1:-1].replace('"', '').split(',')
                            value = [tag.strip() for tag in value if tag.strip()]
                        else:
                            value = [value]
                    xhsdata[filename][field] = value
                    self.logger.debug(f"处理 {field}: {value}")
        except Exception:
            self.logger.exception(f"处理标签/分类时发生错误")
            raise

    def _process_image_path(self, metadata, xhsdata, filename):
        """Strip a leading '/img/' or 'img/' from the cover image path."""
        try:
            if 'image' in metadata:
                image_path = metadata['image']
                clean_image = re.sub(r'^(/?img/)', '', image_path)
                xhsdata[filename]['image'] = clean_image
                self.logger.debug(f"处理图片路径: {image_path} -> {clean_image}")
        except Exception:
            self.logger.exception(f"处理图片路径时发生错误")
            raise

    def _parse_content_images(self, lines, xhsdata, filename):
        """Collect XHS paragraphs, image paths and markdown tables."""
        try:
            xhsdata[filename]['content'] = []
            xhsdata[filename]['images'] = []
            xhsdata[filename]['tables'] = []  # each table is a list of row lists
            in_xhs_section = False
            current_paragraph = []
            in_table = False
            table_rows = []
            last_line_was_heading = False  # body lines right after a heading are skipped
            for line in lines:
                if line.strip() == '<!--xhs-->':
                    # A new marker flushes any paragraph already collected.
                    if current_paragraph and in_xhs_section:
                        xhsdata[filename]['content'].append(' '.join(current_paragraph))
                        current_paragraph = []
                        self.logger.debug(f"添加段落: {xhsdata[filename]['content'][-1][:30]}...")
                    in_xhs_section = True
                    last_line_was_heading = False
                    continue
                if in_xhs_section:
                    # A blank line ends the current XHS section.
                    if line.strip() == '':
                        if current_paragraph:
                            xhsdata[filename]['content'].append(' '.join(current_paragraph))
                            current_paragraph = []
                            self.logger.debug(f"添加段落: {xhsdata[filename]['content'][-1][:30]}...")
                        if in_table:
                            xhsdata[filename]['tables'].append(table_rows)
                            table_rows = []
                            in_table = False
                        in_xhs_section = False
                        last_line_was_heading = False
                        continue
                    # Headings become standalone content entries.
                    if line.strip().startswith('#'):
                        if current_paragraph:
                            xhsdata[filename]['content'].append(' '.join(current_paragraph))
                            current_paragraph = []
                            self.logger.debug(f"添加段落: {xhsdata[filename]['content'][-1][:30]}...")
                        xhsdata[filename]['content'].append(line.strip())
                        self.logger.debug(f"添加标题: {line.strip()[:30]}...")
                        last_line_was_heading = True
                        continue
                    # Skip the body line directly following a heading.
                    if last_line_was_heading:
                        self.logger.debug(f"忽略标题后的内容: {line.strip()[:30]}...")
                        continue
                    # Image links: ![alt](path) and ![[path]].
                    image_matches = re.findall(r'!\[.*?\]\((.*?)\)', line)
                    image_matches.extend(re.findall(r'!\[\[(.*?)\]\]', line))
                    for match in image_matches:
                        clean_match = re.sub(r'^(/?img/)', '', match)
                        xhsdata[filename]['images'].append(clean_match)
                        self.logger.debug(f"提取图片: {clean_match}")
                    # Text with image markup removed.
                    text = re.sub(r'!\[.*?\]\(.*?\)', '', line).strip()
                    text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
                    # Markdown table rows ("| a | b |").
                    if line.strip().startswith('|'):
                        if not in_table:
                            in_table = True
                        row = [cell.strip() for cell in line.strip().strip('|').split('|')]
                        table_rows.append(row)
                    else:
                        if in_table:
                            xhsdata[filename]['tables'].append(table_rows)
                            table_rows = []
                            in_table = False
                    # NOTE(review): table-row text also reaches
                    # current_paragraph here — presumably intentional;
                    # confirm with callers before changing.
                    if text:
                        current_paragraph.append(text)
                    last_line_was_heading = False
            # Flush any trailing paragraph / table at end of file.
            if current_paragraph and in_xhs_section:
                xhsdata[filename]['content'].append(' '.join(current_paragraph))
                self.logger.debug(f"添加最后一个段落: {xhsdata[filename]['content'][-1][:30]}...")
            if in_table:
                xhsdata[filename]['tables'].append(table_rows)
        except Exception:
            self.logger.exception(f"解析内容和图片时发生错误")
            raise

    def render_to_html(self, xhsdata):
        """Render parsed data to an HTML fragment.

        Order per file: xhstitle, content (markdown headings map to
        <h1>..<hN> with shrinking font sizes), tables, three <br> spacers,
        xhstags (one per line), xhssign.
        """
        html_parts = []
        for filename, data in xhsdata.items():
            # Title.
            if 'xhstitle' in data:
                html_parts.append(f'<h1 style="font-size: 24px;">{data["xhstitle"]}</h1>')
            # Body paragraphs and headings.
            for item in data.get('content', []):
                if item.startswith('#'):
                    level = len(item.split(' ')[0])
                    title_text = item.replace('#' * level, '').strip()
                    font_size = 24 - (level - 1) * 2
                    html_parts.append(f'<h{level} style="font-size: {font_size}px;">{title_text}</h{level}>')
                else:
                    html_parts.append(f'<p style="font-size: 16px;">{item}</p>')
            # Tables.
            for table in data.get('tables', []):
                html_parts.append('<table border="1">')
                for row in table:
                    html_parts.append('<tr>')
                    for cell in row:
                        html_parts.append(f'<td style="font-size: 14px;">{cell}</td>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')
            # Three blank lines between content and the tag list.
            for _ in range(3):
                html_parts.append('<br>')
            # Tags, one per line.
            if 'xhstags' in data:
                for tag in data['xhstags']:
                    html_parts.append(f'<span style="font-size: 14px;">#{tag}</span><br>')
            # Signature.
            if 'xhssign' in data:
                html_parts.append(f'<br><br><br><span style="font-size: 14px;">{data["xhssign"]}</span><br>')
        html = '\n'.join(html_parts)
        return html
# Example usage: parse a sample markdown file, dump it as JSON, and write
# the rendered HTML to disk.
if __name__ == "__main__":
    parser = MarkdownParser(log_file='markdown_parser.log')
    try:
        source = 'markdown/test.md'
        parsed = parser.parse_markdown_file(source)
        print(json.dumps(parsed, indent=2, ensure_ascii=False))
        rendered = parser.render_to_html(parsed)
        # Persist the fragment using GBK encoding.
        with open('test.html', 'w', encoding='gbk') as out:
            out.write(rendered)
    except Exception as e:
        print(f"程序运行出错: {e}")

View File

@@ -0,0 +1,242 @@
#########################################################
## @file : parse_markdown_file.py
## @desc : parse hugo markdown file
## @create : 2025/6/22
## @author : Chengandoubao AI
## @email : douboer@gmail.com
#########################################################
import logging
import json
from collections import defaultdict
import re
from pathlib import Path
# 配置日志
def setup_logger(log_file=None):
    """Create (or return the already-configured) 'markdown_parser' logger.

    BUGFIX: ``logging.getLogger('markdown_parser')`` is a process-wide
    singleton; the original attached new handlers on every call, duplicating
    log output.  Handlers are now attached only once.

    Args:
        log_file: optional path; when given, DEBUG-and-above records are also
            written to this file (only honoured on the first call).

    Returns:
        The configured ``logging.Logger``.
    """
    logger = logging.getLogger('markdown_parser')
    logger.setLevel(logging.INFO)
    # Already configured by an earlier call: do not stack duplicate handlers.
    if logger.handlers:
        return logger
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.INFO)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    if log_file:
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(logging.DEBUG)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    return logger
class MarkdownParser:
    """Parse a Hugo markdown file into metadata plus XHS content and images.

    ``parse_markdown_file`` produces a ``defaultdict`` shaped as::

        {filename: {metadata..., 'content': [...], 'images': [...]}}

    Only paragraphs introduced by a line that is exactly ``<!--xhs-->`` are
    collected (up to the next blank line).
    """

    def __init__(self, log_file=None):
        """Set up logging; *log_file* additionally writes records to a file."""
        self.logger = setup_logger(log_file)
        self.logger.info("MarkdownParser initialized")

    def parse_markdown_file(self, file_path):
        """Parse *file_path*, returning metadata plus XHS content/images.

        Raises:
            FileNotFoundError: the path does not exist.
            ValueError: the path is not a regular file.
        """
        xhsdata = defaultdict(dict)
        try:
            # Validate the path before reading.
            file_path = Path(file_path)
            if not file_path.exists():
                self.logger.error(f"文件不存在: {file_path}")
                raise FileNotFoundError(f"文件不存在: {file_path}")
            if not file_path.is_file():
                self.logger.error(f"不是有效的文件: {file_path}")
                raise ValueError(f"不是有效的文件: {file_path}")
            filename = file_path.name
            # BUGFIX: the message had degraded to a literal "(unknown)";
            # restore the filename placeholder.
            self.logger.info(f"开始解析文件: {filename}")
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
            # YAML front matter.
            metadata = self._parse_metadata(lines)
            # Normalise list-valued fields into xhsdata.
            self._process_tags_categories(metadata, xhsdata, filename)
            # Strip the img/ prefix from the cover image.
            self._process_image_path(metadata, xhsdata, filename)
            # Copy remaining metadata verbatim.
            for key, value in metadata.items():
                if key not in xhsdata[filename]:
                    xhsdata[filename][key] = value
            # Body text and inline images.
            self._parse_content_images(lines, xhsdata, filename)
            self.logger.info(f"文件解析完成: {filename}")
            return xhsdata
        except Exception:
            self.logger.exception(f"解析文件时发生错误: {file_path}")
            raise

    def _parse_metadata(self, lines):
        """Parse the YAML front matter (between the first '---' pair)."""
        metadata = {}
        current_key = None
        current_list = []
        in_front_matter = False
        try:
            for line in lines:
                if line.strip() == '---':
                    if in_front_matter:
                        break
                    in_front_matter = True
                    continue
                if not in_front_matter:
                    continue
                line = line.rstrip('\n')
                # YAML list item belonging to the pending key; the "  - "
                # prefix is reconstructed from a whitespace-mangled paste
                # (line[4:] implies a 4-character marker).
                if line.startswith('  - '):
                    if current_key:
                        current_list.append(line[4:].strip())
                    continue
                if ':' in line:
                    # A new key closes any list still being accumulated.
                    if current_key and current_list:
                        metadata[current_key] = current_list
                        current_list = []
                    key, *value_parts = line.split(':', 1)
                    key = key.strip()
                    value = value_parts[0].strip() if value_parts else ''
                    if value == '':
                        # Bare "key:" opens a multi-line (list) value.
                        current_key = key
                        current_list = []
                    else:
                        metadata[key] = value
                        current_key = None
            # Flush a list that ran to the end of the front matter.
            if current_key and current_list:
                metadata[current_key] = current_list
            self.logger.debug(f"解析元数据完成: {metadata}")
            return metadata
        except Exception:
            self.logger.exception("解析元数据时发生错误")
            raise

    def _process_tags_categories(self, metadata, xhsdata, filename):
        """Normalise tags/xhstags/categories into lists of strings."""
        try:
            for field in ['tags', 'xhstags', 'categories']:
                if field in metadata:
                    value = metadata[field]
                    if isinstance(value, str):
                        # Inline "[tag1,tag2]" form.
                        if value.startswith('[') and value.endswith(']'):
                            value = value[1:-1].replace('"', '').split(',')
                            value = [tag.strip() for tag in value if tag.strip()]
                        else:
                            value = [value]
                    xhsdata[filename][field] = value
                    self.logger.debug(f"处理 {field}: {value}")
        except Exception:
            self.logger.exception(f"处理标签/分类时发生错误")
            raise

    def _process_image_path(self, metadata, xhsdata, filename):
        """Strip a leading '/img/' or 'img/' from the cover image path."""
        try:
            if 'image' in metadata:
                image_path = metadata['image']
                clean_image = re.sub(r'^(/?img/)', '', image_path)
                xhsdata[filename]['image'] = clean_image
                self.logger.debug(f"处理图片路径: {image_path} -> {clean_image}")
        except Exception:
            self.logger.exception(f"处理图片路径时发生错误")
            raise

    def _parse_content_images(self, lines, xhsdata, filename):
        """Collect XHS paragraphs and inline image paths."""
        try:
            xhsdata[filename]['content'] = []
            xhsdata[filename]['images'] = []
            in_xhs_section = False
            current_paragraph = []
            for line in lines:
                if line.strip() == '<!--xhs-->':
                    # A new marker flushes any paragraph already collected.
                    if current_paragraph and in_xhs_section:
                        xhsdata[filename]['content'].append(' '.join(current_paragraph))
                        current_paragraph = []
                        self.logger.debug(f"添加段落: {xhsdata[filename]['content'][-1][:30]}...")
                    in_xhs_section = True
                    continue
                if in_xhs_section:
                    # A blank line ends the current XHS section.
                    if line.strip() == '':
                        if current_paragraph:
                            xhsdata[filename]['content'].append(' '.join(current_paragraph))
                            current_paragraph = []
                            self.logger.debug(f"添加段落: {xhsdata[filename]['content'][-1][:30]}...")
                        in_xhs_section = False
                        continue
                    # Image links: ![alt](path) and ![[path]].
                    image_matches = re.findall(r'!\[.*?\]\((.*?)\)', line)
                    image_matches.extend(re.findall(r'!\[\[(.*?)\]\]', line))
                    for match in image_matches:
                        clean_match = re.sub(r'^(/?img/)', '', match)
                        xhsdata[filename]['images'].append(clean_match)
                        self.logger.debug(f"提取图片: {clean_match}")
                    # Text with image markup removed.
                    text = re.sub(r'!\[.*?\]\(.*?\)', '', line).strip()
                    text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
                    if text:
                        current_paragraph.append(text)
            # Flush a paragraph that ran to the end of the file.
            if current_paragraph and in_xhs_section:
                xhsdata[filename]['content'].append(' '.join(current_paragraph))
                self.logger.debug(f"添加最后一个段落: {xhsdata[filename]['content'][-1][:30]}...")
        except Exception:
            self.logger.exception(f"解析内容和图片时发生错误")
            raise
# Example usage: parse a sample markdown file and dump the result as JSON.
if __name__ == "__main__":
    parser = MarkdownParser(log_file='markdown_parser.log')
    try:
        source = 'markdown/test.md'
        parsed = parser.parse_markdown_file(source)
        print(json.dumps(parsed, indent=2, ensure_ascii=False))
    except Exception as e:
        print(f"程序运行出错: {e}")

View File

@@ -0,0 +1,141 @@
#########################################################
## @file : parse_markdown_file.py
## @desc : parse hugo markdown file
## @create : 2025/6/22
## @author : Chengandoubao AI
## @email : douboer@gmail.com
#########################################################
from collections import defaultdict
import re
import json
def parse_markdown_file(file_path):
    """Parse a Hugo markdown file into ``{filename: {metadata..., content, images}}``.

    Paragraphs introduced by a line that is exactly ``<!--xhs-->`` are
    collected up to the next blank line; a repeated marker flushes the
    paragraph collected so far.  Image links of the forms ``![alt](path)``
    and ``![[path]]`` go into ``images`` with any leading ``/img/`` or
    ``img/`` prefix removed.

    Args:
        file_path: path to the markdown file (POSIX separators assumed).

    Returns:
        defaultdict mapping the file's basename to the parsed fields.
    """
    xhsdata = defaultdict(dict)
    filename = file_path.split('/')[-1]
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # --- YAML front matter (between the first pair of '---' lines) ---
    metadata = {}
    current_key = None
    current_list = []
    in_front_matter = False
    for line in lines:
        if line.strip() == '---':
            if in_front_matter:
                break
            in_front_matter = True
            continue
        if not in_front_matter:
            continue
        line = line.rstrip('\n')
        # YAML list item ("  - value") belonging to the pending key
        # (prefix reconstructed from a whitespace-mangled paste; line[4:]
        # implies a 4-character marker).
        if line.startswith('  - '):
            if current_key:
                current_list.append(line[4:].strip())
            continue
        if ':' in line:
            # A new key closes any list still being accumulated.
            if current_key and current_list:
                metadata[current_key] = current_list
                current_list = []
            key, *value_parts = line.split(':', 1)
            key = key.strip()
            value = value_parts[0].strip() if value_parts else ''
            if value == '':
                # Bare "key:" opens a multi-line (list) value.
                current_key = key
                current_list = []
            else:
                metadata[key] = value
                current_key = None
    # Flush a list that ran to the end of the front matter.
    if current_key and current_list:
        metadata[current_key] = current_list

    # Normalise tag-like fields: "[a,b]" or an already-parsed list -> list.
    for field in ['tags', 'xhstags', 'categories']:
        if field in metadata:
            value = metadata[field]
            if isinstance(value, str):
                if value.startswith('[') and value.endswith(']'):
                    value = value[1:-1].replace('"', '').split(',')
                    value = [tag.strip() for tag in value if tag.strip()]
                else:
                    value = [value]
            xhsdata[filename][field] = value

    # Cover image: strip a leading '/img/' or 'img/'.
    if 'image' in metadata:
        image_path = metadata['image']
        clean_image = re.sub(r'^(/?img/)', '', image_path)
        xhsdata[filename]['image'] = clean_image

    # Copy the remaining metadata verbatim.
    for key, value in metadata.items():
        if key not in xhsdata[filename]:
            xhsdata[filename][key] = value

    # --- body: only <!--xhs--> flagged paragraphs ---
    xhsdata[filename]['content'] = []
    xhsdata[filename]['images'] = []
    in_xhs_section = False
    current_paragraph = []
    for line in lines:
        if line.strip() == '<!--xhs-->':
            # A repeated marker flushes the paragraph collected so far.
            if current_paragraph and in_xhs_section:
                xhsdata[filename]['content'].append(' '.join(current_paragraph))
                current_paragraph = []
            in_xhs_section = True
            continue
        if in_xhs_section:
            if line.strip() == '':
                # Blank line ends the flagged paragraph.
                if current_paragraph:
                    xhsdata[filename]['content'].append(' '.join(current_paragraph))
                    current_paragraph = []
                in_xhs_section = False
            else:
                # Image links: ![alt](path) and ![[path]].
                image_matches = re.findall(r'!\[.*?\]\((.*?)\)', line)
                image_matches.extend(re.findall(r'!\[\[(.*?)\]\]', line))
                for match in image_matches:
                    # BUGFIX: was r'^(/|img/)' which rewrote '/img/x' to
                    # 'img/x'; strip the whole optional-slash img/ prefix,
                    # consistent with the class-based parser.
                    clean_match = re.sub(r'^(/?img/)', '', match)
                    xhsdata[filename]['images'].append(clean_match)
                # Keep the text with image markup removed.
                text = re.sub(r'!\[.*?\]\(.*?\)', '', line).strip()
                text = re.sub(r'!\[\[.*?\]\]', '', text).strip()
                if text:
                    current_paragraph.append(text)
    # Flush a paragraph that ran to the end of the file.
    if current_paragraph:
        xhsdata[filename]['content'].append(' '.join(current_paragraph))
    return xhsdata
if __name__ == "__main__":
    # Example usage; adjust the path to the markdown file to parse.
    # BUGFIX: this ran unconditionally at import time (and crashed when the
    # sample file was absent); it is now guarded behind __main__.
    file_path = 'markdown/test.md'
    result = parse_markdown_file(file_path)
    print(json.dumps(result, indent=2, ensure_ascii=False))

237
backup/pretty_html_table.py Normal file
View File

@@ -0,0 +1,237 @@
import io
# Reformat table_color as dict of tuples
# Palette table used by build_table().  Each entry maps a palette name to
# (header_text_color, header_bottom_border_css, odd_row_background,
#  header_background).  "_light" palettes use a white header background with
# colored text; "_dark" palettes invert that.
dict_colors = {
'yellow_light' : ('#BF8F00', '2px solid #BF8F00', '#FFF2CC', '#FFFFFF'),
'grey_light' : ('#808080', '2px solid #808080', '#EDEDED', '#FFFFFF'),
'blue_light' : ('#305496', '2px solid #305496', '#D9E1F2', '#FFFFFF'),
'orange_light' : ('#C65911', '2px solid #C65911', '#FCE4D6', '#FFFFFF'),
'green_light' : ('#548235', '2px solid #548235', '#E2EFDA', '#FFFFFF'),
'red_light' : ('#823535', '2px solid #823535', '#efdada', '#FFFFFF'),
'yellow_dark' : ('#FFFFFF', '2px solid #BF8F00', '#FFF2CC', '#BF8F00'),
'grey_dark' : ('#FFFFFF', '2px solid #808080', '#EDEDED', '#808080'),
'blue_dark': ('#FFFFFF', '2px solid #305496', '#D9E1F2', '#305496'),
'orange_dark' : ('#FFFFFF', '2px solid #C65911', '#FCE4D6', '#C65911'),
'green_dark' : ('#FFFFFF', '2px solid #548235', '#E2EFDA', '#548235'),
'red_dark' : ('#FFFFFF', '2px solid #823535', '#efdada', '#823535')
}
def build_table(
        df,
        color,
        font_size='medium',
        font_family='Century Gothic, sans-serif',
        text_align='left',
        width='auto',
        index=False,
        even_color='black',
        even_bg_color='white',
        odd_bg_color=None,
        border_bottom_color=None,
        escape=True,
        width_dict=[],
        padding="0px 20px 0px 0px",
        float_format=None,
        conditions={}):
    """Render a pandas DataFrame as a styled HTML table string.

    Rows are rendered one at a time via ``DataFrame.to_html`` and styled by
    string-replacing the ``<th>``/``<td>`` tags with inline-styled versions,
    alternating odd/even row colors.  *color* selects a palette from
    ``dict_colors``.  *conditions* optionally recolors numeric cells outside
    a per-column [min, max] range; *width_dict* optionally assigns a fixed
    width per column.

    NOTE(review): the indentation of this block was reconstructed from a
    whitespace-mangled paste, and the triple-quoted HTML literals below look
    like they lost their internal leading spaces (pandas' to_html output is
    indented) — verify against the upstream pretty_html_table sources.
    NOTE(review): *width_dict* and *conditions* are mutable default
    arguments — shared across calls; confirm no caller mutates them.
    """
    # An empty frame renders as an empty string.
    if df.empty:
        return ''
    # Set color: unpack the selected palette, then apply caller overrides.
    color, border_bottom, odd_background_color, header_background_color = dict_colors[color]
    if odd_bg_color:
        odd_background_color = odd_bg_color
    if border_bottom_color:
        border_bottom = border_bottom_color
    # Render row-by-row: a == 0 emits the header + first row; thereafter
    # even/odd a alternate between the "odd" and "even" row styles.
    a = 0
    while a != len(df):
        if a == 0:
            df_html_output = df.iloc[[a]].to_html(
                na_rep="",
                index=index,
                border=0,
                escape=escape,
                float_format=float_format,
            )
            # change format of header
            if index:
                # With an index there is one extra <th> per row; the count
                # argument styles only the header row's cells.
                df_html_output = df_html_output.replace('<th>'
                    ,'<th style = "background-color: ' + header_background_color
                    + ';font-family: ' + font_family
                    + ';font-size: ' + str(font_size)
                    + ';color: ' + color
                    + ';text-align: ' + text_align
                    + ';border-bottom: ' + border_bottom
                    + ';padding: ' + padding
                    + ';width: ' + str(width) + '">', len(df.columns)+1)
                # Remaining <th> tags are the row-index cells.
                df_html_output = df_html_output.replace('<th>'
                    ,'<th style = "background-color: ' + odd_background_color
                    + ';font-family: ' + font_family
                    + ';font-size: ' + str(font_size)
                    + ';text-align: ' + text_align
                    + ';padding: ' + padding
                    + ';width: ' + str(width) + '">')
            else:
                df_html_output = df_html_output.replace('<th>'
                    ,'<th style = "background-color: ' + header_background_color
                    + ';font-family: ' + font_family
                    + ';font-size: ' + str(font_size)
                    + ';color: ' + color
                    + ';text-align: ' + text_align
                    + ';border-bottom: ' + border_bottom
                    + ';padding: ' + padding
                    + ';width: ' + str(width) + '">')
            #change format of table
            df_html_output = df_html_output.replace('<td>'
                ,'<td style = "background-color: ' + odd_background_color
                + ';font-family: ' + font_family
                + ';font-size: ' + str(font_size)
                + ';text-align: ' + text_align
                + ';padding: ' + padding
                + ';width: ' + str(width) + '">')
            body = """<p>""" + format(df_html_output)
            a = 1
        elif a % 2 == 0:
            # Even-indexed rows reuse the "odd" (light) background style.
            df_html_output = df.iloc[[a]].to_html(na_rep = "", index = index, header = False, escape=escape)
            # change format of index
            df_html_output = df_html_output.replace('<th>'
                ,'<th style = "background-color: ' + odd_background_color
                + ';font-family: ' + font_family
                + ';font-size: ' + str(font_size)
                + ';text-align: ' + text_align
                + ';padding: ' + padding
                + ';width: ' + str(width) + '">')
            #change format of table
            df_html_output = df_html_output.replace('<td>'
                ,'<td style = "background-color: ' + odd_background_color
                + ';font-family: ' + font_family
                + ';font-size: ' + str(font_size)
                + ';text-align: ' + text_align
                + ';padding: ' + padding
                + ';width: ' + str(width) + '">')
            body = body + format(df_html_output)
            a += 1
        elif a % 2 != 0:
            # Odd-indexed rows use the "even" (caller-configurable) style.
            df_html_output = df.iloc[[a]].to_html(na_rep = "", index = index, header = False, escape=escape)
            # change format of index
            df_html_output = df_html_output.replace('<th>'
                ,'<th style = "background-color: ' + even_bg_color
                + '; color: ' + even_color
                + ';font-family: ' + font_family
                + ';font-size: ' + str(font_size)
                + ';text-align: ' + text_align
                + ';padding: ' + padding
                + ';width: ' + str(width) + '">')
            #change format of table
            df_html_output = df_html_output.replace('<td>'
                ,'<td style = "background-color: ' + even_bg_color
                + '; color: ' + even_color
                + ';font-family: ' + font_family
                + ';font-size: ' + str(font_size)
                + ';text-align: ' + text_align
                + ';padding: ' + padding
                + ';width: ' + str(width) + '">')
            body = body + format(df_html_output)
            a += 1
    body = body + """</p>"""
    # Stitch the per-row tables into one table by collapsing the closing /
    # reopening table markup between consecutive rows.
    body = body.replace("""</td>
</tr>
</tbody>
</table>
<table border="1" class="dataframe">
<tbody>
<tr>""","""</td>
</tr>
<tr>""").replace("""</td>
</tr>
</tbody>
</table><table border="1" class="dataframe">
<tbody>
<tr>""","""</td>
</tr>
<tr>""")
    if conditions:
        # Conditional coloring: walk the HTML line by line, tracking the
        # column index w, and recolor numeric cells outside [min, max].
        for k in conditions.keys():
            try:
                conditions[k]['index'] = list(df.columns).index(k)
                width_body = ''
                w = 0
                for line in io.StringIO(body):
                    updated_body = False
                    if w == conditions[k]['index']:
                        try:
                            # Cell text sits between the first '>' and '<'.
                            if int(repr(line).split('>')[1].split('<')[0]) < conditions[k]['min']:
                                if 'color: black' in repr(line):
                                    width_body = width_body + repr(line).replace("color: black", 'color: ' + conditions[k]['min_color'])[1:]
                                elif 'color: white' in repr(line):
                                    width_body = width_body + repr(line).replace("color: white", 'color: ' + conditions[k]['min_color'])[1:]
                                else:
                                    width_body = width_body + repr(line).replace('">', '; color: ' + conditions[k]['min_color'] + '">')[1:]
                                updated_body = True
                            elif int(repr(line).split('>')[1].split('<')[0]) > conditions[k]['max']:
                                if 'color: black' in repr(line):
                                    width_body = width_body + repr(line).replace("color: black", 'color: ' + conditions[k]['max_color'])[1:]
                                elif 'color: white' in repr(line):
                                    width_body = width_body + repr(line).replace("color: white", 'color: ' + conditions[k]['max_color'])[1:]
                                else:
                                    width_body = width_body + repr(line).replace('">', '; color: ' + conditions[k]['max_color'] + '">')[1:]
                                updated_body = True
                        except:
                            # Non-numeric cell: leave it untouched.
                            pass
                    if not updated_body:
                        width_body = width_body + repr(line)[1:]
                    # Advance/reset the column counter on cell lines.
                    # NOTE(review): the compared literals appear to have lost
                    # their internal spaces in this paste.
                    if str(repr(line))[:10] == "' <td" or str(repr(line))[:10] == "' <th":
                        if w == len(df.columns) -1:
                            w = 0
                        else:
                            w += 1
                body = width_body[:len(width_body)-1]
            except:
                # Unknown column name in conditions: skip it.
                pass
    if len(width_dict) == len(df.columns):
        # Per-column widths: rewrite 'width: auto' cell by cell.
        width_body = ''
        w = 0
        if conditions:
            # After conditional coloring, body is repr()-escaped, so split
            # on the literal "\n'" sequence instead of real newlines.
            for line in body.split(r"\n'"):
                line = line.replace("\n", "")
                width_body = width_body + repr(line).replace("width: auto", 'width: ' + width_dict[w])[1:]
                if str(repr(line))[:10] == "' <td" or str(repr(line))[:10] == "' <th" :
                    if w == len(df.columns) -1:
                        w = 0
                    else:
                        w += 1
        else:
            for line in io.StringIO(body):
                line = line.replace("\n", "")
                width_body = width_body + repr(line).replace("width: auto", 'width: ' + width_dict[w])[1:]
                if str(repr(line))[:10] == "' <td" or str(repr(line))[:10] == "' <th" :
                    if w == len(df.columns) -1:
                        w = 0
                    else:
                        w += 1
        return width_body[:len(width_body)-1].replace("'", "")
    else:
        return body.replace(r"\n'", "")

43
backup/push_files.py Normal file
View File

@@ -0,0 +1,43 @@
import subprocess
def get_uncommitted_changes():
    """Return uncommitted git changes grouped by status.

    Parses ``git status --porcelain`` output (two status characters, a
    space, then the path).

    Returns:
        dict with keys 'added', 'deleted', 'modified', each a list of file
        paths.  All lists are empty when git is unavailable or the current
        directory is not a repository (best-effort, no exception raised).
    """
    changes = {
        'added': [],
        'deleted': [],
        'modified': []
    }
    # Keep the try body minimal: only the subprocess call can raise here.
    try:
        status_output = subprocess.check_output(
            ["git", "status", "--porcelain"],
            text=True,
            stderr=subprocess.DEVNULL
        ).splitlines()
    # BUGFIX: a missing git binary raises FileNotFoundError/OSError, which
    # the original let propagate; treat it the same as "not a repo".
    except (subprocess.CalledProcessError, OSError):
        return changes
    for line in status_output:
        status_code = line[:2].strip()
        file_path = line[3:].strip()
        if status_code in ('??', 'A'):
            # '??' = untracked; 'A' = staged new file (previously
            # unclassified and silently dropped).
            changes['added'].append(file_path)
        elif status_code == 'D':
            changes['deleted'].append(file_path)
        elif status_code in ('M', 'AM', 'MD'):
            changes['modified'].append(file_path)
    return changes
if __name__ == "__main__":
    # Example usage: print pending changes grouped by status.
    # BUGFIX: ran unconditionally at import time; now guarded behind __main__.
    changes = get_uncommitted_changes()
    for status, files in changes.items():
        if files:
            print(f"\n{status.upper()}:")
            for file in files:
                print(f" - {file}")

326
backup/txt_to_image.py Normal file
View File

@@ -0,0 +1,326 @@
#########################################################
## @file : txt_to_image.py
## @desc : text content convert to image
## @create : 2025/6/22
## @author : Chengandoubao AI
## @email : douboer@gmail.com
#########################################################
from logger_utils import CommonLogger
from PIL import Image, ImageDraw, ImageFont, ImageFilter
import textwrap
import random
import os
from config import fonts_path, templates, styles
class TextToImage:
    """Convert text content into styled images.

    Renders a title, an optional subtitle, body paragraphs and an optional
    signature onto a template background (1244x1660), starting a new page
    automatically when the content overflows the current one.
    """

    def __init__(self, log_file=None):
        """Set up logging, font paths and background templates.

        Args:
            log_file: optional log-file path; console logging is always on
                (via CommonLogger).
        """
        self.logger = CommonLogger(log_file).get_logger()
        self.fonts_path = fonts_path
        self.templates = templates
        self.target_size = (1244, 1660)  # output page size: 1244x1660

    def create_image(self, title, content, subtitle=None, signature=None,
                     template='minimal',
                     output_path='output.png'):
        """Render the given text onto one or more images.

        Args:
            title: heading drawn centered at the top of the first page.
            content: body text; blank lines ("\\n\\n") separate paragraphs.
            subtitle: optional centered sub-heading below the title.
            signature: optional signature line.
            template: key into the `templates` config selecting the
                background image and padding.
            output_path: base .png path; each page is saved as
                <base>_<page>.png.

        Returns:
            List of saved image paths, or None if rendering failed.
        """
        try:
            # Validate the requested template.
            if template not in self.templates:
                raise ValueError(f"模板 '{template}' 不存在")
            template_data = self.templates[template]

            # Normalize the background image to the target page size.
            background = self._process_background(template_data['background'])
            draw = ImageDraw.Draw(background)
            width, height = background.size
            padding = template_data['padding']

            # Pull per-section styles from config.
            title_style = styles['title']
            subtitle_style = styles['subtitle']
            body_style = styles['body']
            signature_style = styles['signature']
            # BUG FIX: config.py does not define these three keys, so the
            # original mandatory subscript access raised KeyError on every
            # call and the method always returned None. Fall back to
            # reasonable defaults derived from the template/body style.
            top_offset = styles.get('top_offset', padding)
            bottom_offset = styles.get('bottom_offset', padding)
            paragraph_spacing = styles.get('paragraph_spacing',
                                           body_style['line_spacing'])

            # Fail early if any configured font file is missing.
            for key, path in self.fonts_path.items():
                if not os.path.exists(path):
                    raise FileNotFoundError(f"字体文件 '{path}' 不存在请确认已放入fonts目录")

            # Load all fonts at their configured sizes.
            title_font = ImageFont.truetype(self.fonts_path['title'], title_style['size'])
            subtitle_font = ImageFont.truetype(self.fonts_path['subtitle'], subtitle_style['size'])
            body_font = ImageFont.truetype(self.fonts_path['body'], body_style['size'])
            signature_font = ImageFont.truetype(self.fonts_path['signature'], signature_style['size'])

            # BUG FIX: ensure the output directory exists (callers pass
            # paths like 'temp/reading_list.png'; saving into a missing
            # directory raised and aborted the whole render).
            out_dir = os.path.dirname(output_path)
            if out_dir:
                os.makedirs(out_dir, exist_ok=True)

            # Draw the title, horizontally centered.
            title_x = (width - self._get_text_width(title, title_font, title_style['letter_spacing'])) // 2
            title_y = padding + title_style['top_spacing']
            self._draw_text_with_spacing(draw, (title_x, title_y), title,
                                         title_font, title_style['color'], title_style['letter_spacing'])

            # Measure the title so following content can be placed below it.
            title_bbox = draw.textbbox((0, 0), title, font=title_font)
            title_height = title_bbox[3] - title_bbox[1]

            # Draw the optional subtitle under the title.
            current_y = title_y + title_height + title_style['bottom_spacing']
            if subtitle:
                subtitle_x = (width - self._get_text_width(subtitle, subtitle_font, subtitle_style['letter_spacing'])) // 2
                self._draw_text_with_spacing(draw, (subtitle_x, current_y), subtitle,
                                             subtitle_font, subtitle_style['color'], subtitle_style['letter_spacing'])
                subtitle_bbox = draw.textbbox((0, 0), subtitle, font=subtitle_font)
                subtitle_height = subtitle_bbox[3] - subtitle_bbox[1]
                current_y += subtitle_height + subtitle_style['bottom_spacing']

            # Body: paragraphs are separated by blank lines; paginate when
            # a paragraph would run past the bottom margin.
            paragraphs = content.split('\n\n')
            page_num = 1
            output_paths = []
            for i, paragraph in enumerate(paragraphs):
                if i > 0:  # inter-paragraph spacing (not before the first)
                    current_y += paragraph_spacing

                if current_y + self._get_paragraph_height(paragraph, body_font, body_style, padding) > height - bottom_offset:
                    # Finish the current page: sign it and save it.
                    if signature:
                        self._draw_signature(draw, width, height, padding, signature,
                                             signature_font, signature_style, current_y)
                    output_path_current = output_path.replace('.png', f'_{page_num}.png')
                    background.save(output_path_current)
                    self.logger.info(f"图片已保存至: {output_path_current}")
                    output_paths.append(output_path_current)

                    # Start a fresh page.
                    background = self._process_background(template_data['background'])
                    draw = ImageDraw.Draw(background)
                    current_y = top_offset
                    page_num += 1

                # Wrap the paragraph to the usable width and draw it.
                max_width = width - padding * 2
                wrapped_text = self._wrap_text(paragraph, body_font, max_width, body_style['letter_spacing'])
                self._draw_multiline_text_with_spacing(
                    draw, (padding, current_y), wrapped_text,
                    body_font, body_style['color'],
                    body_style['letter_spacing'], body_style['line_spacing']
                )

                # Advance past the drawn paragraph.
                text_bbox = draw.textbbox((0, 0), wrapped_text, font=body_font)
                text_height = (text_bbox[3] - text_bbox[1]) + (body_style['line_spacing'] * (wrapped_text.count('\n') or 1))
                current_y += text_height + body_style['line_spacing']

            # Sign and save the final page.
            if signature:
                self._draw_signature(draw, width, height, padding, signature,
                                     signature_font, signature_style, current_y)
            output_path_current = output_path.replace('.png', f'_{page_num}.png')
            background.save(output_path_current)
            self.logger.info(f"图片已保存至: {output_path_current}")
            output_paths.append(output_path_current)
            return output_paths

        except Exception as e:
            # Log the full traceback and signal failure with None.
            self.logger.exception(f"创建图片时发生错误: {e}")
            return None

    def _get_text_width(self, text, font, letter_spacing):
        """Return the pixel width of `text` including inter-character spacing."""
        if not text:
            return 0
        bbox = font.getbbox(text)
        # Spacing is inserted between characters, hence len(text) - 1 gaps.
        return bbox[2] - bbox[0] + (letter_spacing * (len(text) - 1))

    def _draw_text_with_spacing(self, draw, position, text, font, fill, letter_spacing):
        """Draw one line of text, advancing by letter_spacing after each char."""
        x, y = position
        for char in text:
            draw.text((x, y), char, font=font, fill=fill)
            # Advance by the glyph width plus the configured spacing.
            char_width = font.getlength(char)
            x += char_width + letter_spacing

    def _draw_multiline_text_with_spacing(self, draw, position, text, font, fill,
                                          letter_spacing, line_spacing):
        """Draw newline-separated text with letter spacing and line spacing."""
        x, y = position
        lines = text.split('\n')
        for line in lines:
            self._draw_text_with_spacing(draw, (x, y), line, font, fill, letter_spacing)
            # Advance by the measured line height plus the line spacing.
            line_bbox = draw.textbbox((0, 0), line, font=font)
            line_height = line_bbox[3] - line_bbox[1]
            y += line_height + line_spacing

    def _add_decorations(self, draw, width, height, template_data):
        """Scatter a few small decorative dots inside the padded area."""
        padding = template_data['padding']
        color = (30, 30, 30)  # default decoration color
        # Optional thin rules at the top and bottom (intentionally disabled).
        #draw.line([(padding, padding//2), (width-padding, padding//2)], fill=color, width=1)
        #draw.line([(padding, height-padding//2), (width-padding, height-padding//2)], fill=color, width=1)
        # Random geometric dots.
        for _ in range(8):
            x = random.randint(padding, width - padding)
            y = random.randint(padding, height - padding)
            draw.ellipse([(x - 2, y - 2), (x + 2, y + 2)], fill=color)

    def _wrap_text(self, text, font, max_width, letter_spacing):
        """Wrap `text` to at most max_width pixels per line.

        Wraps character-by-character (suitable for CJK text). Existing
        newlines are preserved; empty lines become paragraph breaks.
        """
        if not text:
            return ""
        wrapped_text = ""
        lines = text.split('\n')
        for line in lines:
            if not line:
                # Preserve blank lines as paragraph breaks.
                wrapped_text += "\n\n"
                continue
            words = list(line)
            current_line = ""
            current_width = 0
            for word in words:
                word_width = font.getlength(word) + letter_spacing
                if current_width + word_width > max_width:
                    if current_line:
                        # Flush the full line and start a new one.
                        wrapped_text += current_line + "\n"
                        current_line = word
                        current_width = word_width
                    else:
                        # A single character wider than the line: emit alone.
                        wrapped_text += word + "\n"
                        current_line = ""
                        current_width = 0
                else:
                    current_line += word
                    current_width += word_width
            if current_line:
                wrapped_text += current_line + "\n"
        return wrapped_text.strip()

    def _process_background(self, background_path):
        """Load a background, scale it to width 1440, then center-crop or
        pad (white) to the 1244x1660 target size.

        Falls back to a plain white page if the image cannot be processed.
        """
        target_width, target_height = self.target_size
        try:
            background = Image.open(background_path).convert("RGBA")
            bg_width, bg_height = background.size

            # Normalize width to 1440, scaling height proportionally.
            if bg_width != 1440:
                ratio = 1440 / bg_width
                new_height = int(bg_height * ratio)
                background = background.resize((1440, new_height), Image.LANCZOS)
                bg_width, bg_height = background.size

            if bg_width >= target_width and bg_height >= target_height:
                # Large enough in both dimensions: center-crop to target.
                left = (bg_width - target_width) // 2
                top = (bg_height - target_height) // 2
                right = left + target_width
                bottom = top + target_height
                background = background.crop((left, top, right, bottom))
            else:
                # Too small in some dimension: paste centered on white.
                new_background = Image.new("RGBA", self.target_size, (255, 255, 255, 255))
                paste_x = (target_width - bg_width) // 2
                paste_y = (target_height - bg_height) // 2
                new_background.paste(background, (paste_x, paste_y))
                background = new_background
        except Exception as e:
            # On any failure, render on a plain white page instead.
            self.logger.exception(f"背景图片处理失败: {e}")
            background = Image.new("RGBA", self.target_size, (255, 255, 255, 255))
        return background

    def _get_paragraph_height(self, paragraph, font, style, padding=None):
        """Estimate the rendered pixel height of one wrapped paragraph.

        Args:
            padding: horizontal padding of the active template. BUG FIX:
                the original always used the 'minimal' template's padding,
                mis-measuring paragraphs for other templates; callers now
                pass the real padding. Defaults to the old behavior for
                backward compatibility.
        """
        if padding is None:
            padding = self.templates['minimal']['padding']
        max_width = self.target_size[0] - 2 * padding
        wrapped_text = self._wrap_text(paragraph, font, max_width, style['letter_spacing'])
        # Measure on a throwaway canvas of the target size.
        text_bbox = ImageDraw.Draw(Image.new("RGBA", self.target_size)).textbbox((0, 0), wrapped_text, font=font)
        text_height = (text_bbox[3] - text_bbox[1]) + (style['line_spacing'] * (wrapped_text.count('\n') or 1))
        return text_height

    def _draw_signature(self, draw, width, height, padding, signature,
                        signature_font, signature_style, current_y=None):
        """Draw the signature at its configured position.

        Args:
            current_y: y position of the content flow, used when
                vertical_position is 'flow'. BUG FIX: the original 'flow'
                branch referenced an undefined `current_y` (NameError);
                it is now an explicit parameter, falling back to bottom
                anchoring when not supplied.
        """
        signature_width = self._get_text_width(signature, signature_font, signature_style['letter_spacing'])
        signature_bbox = draw.textbbox((0, 0), signature, font=signature_font)
        signature_height = signature_bbox[3] - signature_bbox[1]

        # Vertical placement: fixed bottom anchor or content flow.
        if signature_style['vertical_position'] == 'bottom' or current_y is None:
            signature_y = height - padding - signature_height - signature_style['offset']
        else:
            signature_y = current_y + 40

        # Horizontal placement: left, center, or right (default).
        if signature_style['position'] == 'left':
            signature_x = padding
        elif signature_style['position'] == 'center':
            signature_x = (width - signature_width) // 2
        else:  # 'right'
            signature_x = width - padding - signature_width

        self._draw_text_with_spacing(draw, (signature_x, signature_y), signature,
                                     signature_font, signature_style['color'], signature_style['letter_spacing'])
if __name__ == "__main__":
    # Sample content used to demo the converter.
    title = "夏日阅读清单"
    subtitle = "2025年必读书单推荐"
    content = """日本寺院有抽签的风尚,大概也是收入不菲的创收项目吧。一个个签位,标了价格,无人售票,这种方式挺好,没有买卖的功利和压力。
不免俗,抽了支,下下,中间有句,大意是人财分离。虽不作兴这个,还是略有不悦。
逛完突然想着去吃抹茶小点,老杨发现转角处就有一家抹茶专业店,可惜刚打烊。边上小店点了一个撒着浅草字样的冰激凌,无惊喜。
途经一家电玩店别有洞天一排排整齐划一机器挨着机器每一排机器造型不同应该超过500台空间被利用到极致。充斥着机器电音令人想起工业朋
克。一簇簇坐着的玩家电子烟是标配60%是中老年人,秃头大爷和白发老奶不少见,并无少年,颠覆我对游戏厅的印象。
这里机器是掌控者,人是机器的有机体延伸,机器设定规则,发出吼叫,刺激神经,填满有机体空虚孤独的心。比起从站台跳下去,被机器奴役也并不算太差,这样想来,风月场也好,游戏厅也好,经营者都有功德。"""
    signature = "@刀波儿"

    # Make sure every directory the script touches exists.
    os.makedirs('fonts', exist_ok=True)
    os.makedirs('backgrounds', exist_ok=True)
    # BUG FIX: the original never created the 'logs' and 'temp' directories
    # it writes into, so the file log handler and the image save could
    # fail on a fresh checkout.
    os.makedirs('logs', exist_ok=True)
    os.makedirs('temp', exist_ok=True)

    # BUG FIX: the original generated backgrounds/minimal.jpg as a fallback,
    # but the 'minimal' template in config.py points at a different file,
    # so that fallback was never used. Create the file the template
    # actually references instead.
    bg_path = templates['minimal']['background']
    if not os.path.exists(bg_path):
        img = Image.new('RGB', (1244, 1660), color=(240, 240, 240))
        draw = ImageDraw.Draw(img)
        # Subtle horizontal ruling every 20px.
        for i in range(0, 1660, 20):
            draw.line([(0, i), (1244, i)], fill=(230, 230, 230))
        img.save(bg_path)

    # Generate the images.
    converter = TextToImage(log_file='logs/txt_to_image.log')
    output_paths = converter.create_image(
        title=title,
        subtitle=subtitle,
        content=content,
        signature=signature,
        template='minimal',
        output_path='temp/reading_list.png'
    )
    print("生成的图片路径:", output_paths)