Initial commit
This commit is contained in:
60
backup/config.py
Normal file
60
backup/config.py
Normal file
@@ -0,0 +1,60 @@
|
||||
# config.py
"""Static rendering configuration: font files, background templates, text styles."""

# Font file used for each content role.
fonts_path = {
    'title': 'fonts/LXGWWenKai-Medium.TTF',
    'subtitle': 'fonts/LXGWWenKai-Medium.TTF',
    'body': 'fonts/LXGWWenKai-Light.TTF',
    'signature': 'fonts/LXGWWenKai-Light.TTF',
}

# Background templates: image file plus the padding value for that template.
templates = {
    'minimal': {
        'background': 'backgrounds/IMG_5784.JPG',
        'padding': 50,
    },
    'modern': {
        'background': 'backgrounds/IMG_5789.JPG',
        'padding': 60,
    },
    'vintage': {
        'background': 'backgrounds/IMG_5793.JPG',
        'padding': 70,
    },
}

# Text style for each content section. Colors are 3-tuples of ints
# (presumably RGB -- confirm against the renderer that consumes them).
styles = {
    'title': {
        'size': 48,
        'letter_spacing': 0,
        'color': (30, 30, 30),
        'line_spacing': 10,
        'top_spacing': 0,
        'bottom_spacing': 20,
    },
    'subtitle': {
        'size': 32,
        'letter_spacing': 0,
        'color': (60, 60, 60),
        'line_spacing': 10,
        'top_spacing': 20,
        'bottom_spacing': 20,
    },
    # NOTE: 'body' deliberately carries no top/bottom spacing keys, as in the original.
    'body': {
        'size': 24,
        'letter_spacing': 0,
        'color': (20, 20, 20),
        'line_spacing': 15,
    },
    'signature': {
        'size': 28,
        'letter_spacing': 2,
        'color': (80, 80, 80),
        'position': 'right',            # horizontal position: left, center, right
        'vertical_position': 'bottom',  # vertical position: bottom, flow
        'offset': 30,                   # offset from the bottom
    },
}
|
||||
31
backup/logger_utils.py
Normal file
31
backup/logger_utils.py
Normal file
@@ -0,0 +1,31 @@
|
||||
|
||||
import logging
|
||||
|
||||
# Shared logger wrapper.
class CommonLogger:
    """Configure this module's logger for console output and optional file output.

    All instances share the one logger returned by ``logging.getLogger(__name__)``.
    Handlers are therefore attached only when an equivalent one is not already
    present; otherwise every new instance would make each record print again.

    Args:
        log_file: Optional path of a log file. When given, a DEBUG-level file
            handler is attached (once per distinct path).
    """

    _FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

    def __init__(self, log_file=None):
        import os  # local import: the module's import block only provides logging

        self.logger = logging.getLogger(__name__)
        formatter = logging.Formatter(self._FORMAT)
        attached = self.logger.handlers  # live list, updated by addHandler()

        # File handler (only when a log file was requested).
        if log_file:
            target = os.path.abspath(os.fspath(log_file))
            already_open = any(
                isinstance(h, logging.FileHandler) and h.baseFilename == target
                for h in attached
            )
            if not already_open:
                # utf-8 so non-ASCII messages are written regardless of locale.
                file_handler = logging.FileHandler(target, encoding='utf-8')
                file_handler.setLevel(logging.DEBUG)
                file_handler.setFormatter(formatter)
                self.logger.addHandler(file_handler)

        # Console handler. FileHandler subclasses StreamHandler, hence the
        # exact type check instead of isinstance().
        if not any(type(h) is logging.StreamHandler for h in attached):
            console_handler = logging.StreamHandler()
            console_handler.setLevel(logging.INFO)
            console_handler.setFormatter(formatter)
            self.logger.addHandler(console_handler)

        # The logger's own level filters before any handler sees a record, so
        # it must admit DEBUG whenever a DEBUG file handler is attached.
        has_file = any(isinstance(h, logging.FileHandler) for h in attached)
        self.logger.setLevel(logging.DEBUG if has_file else logging.INFO)

    def get_logger(self):
        """Return the configured ``logging.Logger``."""
        return self.logger
|
||||
|
||||
154
backup/parse_markdown_file.ai.py
Normal file
154
backup/parse_markdown_file.ai.py
Normal file
@@ -0,0 +1,154 @@
|
||||
|
||||
|
||||
|
||||
from collections import defaultdict
|
||||
import re
|
||||
import json
|
||||
|
||||
def parse_markdown_file(file_path):
    """Parse a Hugo-style Markdown file into ``{filename: {...}}``.

    The result entry holds the front-matter keys plus:

    * ``content`` -- one string per paragraph found in a ``<!--xhs-->`` section
      (a section runs from the marker line to the next blank line);
    * ``images``  -- image paths found in those sections, from ``![alt](path)``
      and ``![[path]]``, with a leading ``/`` and/or ``img/`` removed.

    ``tags``, ``xhstags`` and ``categories`` are always normalised to lists;
    ``image`` has a leading ``/img/`` or ``img/`` removed.

    Args:
        file_path: Path of the Markdown file (str or path-like).

    Returns:
        ``defaultdict(dict)`` keyed by the file's base name.

    Raises:
        OSError: If the file cannot be opened.
    """
    import os  # local import: not part of this module's import block

    md_image = re.compile(r'!\[.*?\]\((.*?)\)')
    wiki_image = re.compile(r'!\[\[(.*?)\]\]')
    front_image_prefix = re.compile(r'^(/?img/)')
    # Optional leading slash, then optional "img/": covers "/img/x", "img/x"
    # and "/x" (the old pattern left "img/" behind on "/img/x").
    body_image_prefix = re.compile(r'^/?(img/)?')

    xhsdata = defaultdict(dict)
    filename = os.path.basename(os.fspath(file_path))

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # ---- front matter (between the first two '---' lines) ----
    metadata = {}
    current_key = None
    current_list = []
    in_front_matter = False

    for raw in lines:
        stripped = raw.strip()
        if stripped == '---':
            if in_front_matter:
                break
            in_front_matter = True
            continue
        if not in_front_matter:
            continue

        # List item under the most recent "key:" line. Matching on the
        # stripped line makes this independent of the indentation width.
        if stripped == '-' or stripped.startswith('- '):
            item = stripped[1:].strip()
            if current_key and item:
                current_list.append(item)
            continue

        line = raw.rstrip('\n')
        if ':' in line:
            # A new key closes any list being collected.
            if current_key and current_list:
                metadata[current_key] = current_list
                current_list = []

            key, _, value = line.partition(':')
            key = key.strip()
            value = value.strip()

            if value == '':
                # "key:" with nothing after it starts a multi-line list.
                current_key = key
                current_list = []
            else:
                metadata[key] = value
                current_key = None

    if current_key and current_list:
        metadata[current_key] = current_list

    entry = xhsdata[filename]

    # ---- tags / categories: always lists ----
    for field in ('tags', 'xhstags', 'categories'):
        if field not in metadata:
            continue
        value = metadata[field]
        if isinstance(value, str):
            if value.startswith('[') and value.endswith(']'):
                # Inline "[a, b]" form.
                pieces = value[1:-1].replace('"', '').split(',')
                value = [tag.strip() for tag in pieces if tag.strip()]
            else:
                value = [value]
        entry[field] = value

    # ---- cover image: drop a leading /img/ or img/ ----
    if 'image' in metadata:
        entry['image'] = front_image_prefix.sub('', metadata['image'])

    # ---- remaining metadata, without overwriting processed fields ----
    for key, value in metadata.items():
        if key not in entry:
            entry[key] = value

    # ---- <!--xhs--> sections ----
    content = entry['content'] = []
    images = entry['images'] = []
    in_xhs_section = False
    paragraph = []

    for raw in lines:
        stripped = raw.strip()
        if stripped == '<!--xhs-->':
            in_xhs_section = True
            continue
        if not in_xhs_section:
            continue

        if stripped == '':
            # A blank line ends the section and its paragraph.
            if paragraph:
                content.append(' '.join(paragraph))
                paragraph = []
            in_xhs_section = False
            continue

        found = md_image.findall(raw) + wiki_image.findall(raw)
        images.extend(body_image_prefix.sub('', match) for match in found)

        # Keep whatever text remains once image markup is removed.
        text = md_image.sub('', raw).strip()
        text = wiki_image.sub('', text).strip()
        if text:
            paragraph.append(text)

    # File ended while a section was still open.
    if paragraph:
        content.append(' '.join(paragraph))

    return xhsdata
|
||||
|
||||
# Example run. Guarded so that importing this module does not read
# 'markdown/test.md' (and fail when it is missing) as a side effect.
if __name__ == "__main__":
    # Change this path to the file to parse.
    file_path = 'markdown/test.md'
    result = parse_markdown_file(file_path)
    print(json.dumps(result, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
''' 豆包自动生成,提示语:
|
||||
用python实现,解析markdown文件,如附件所示,要求:
|
||||
1. 解析的内容放到defaultdict数据结构中:
|
||||
xhsdata =
|
||||
{
|
||||
"filename": {
|
||||
"title": "Labubu爆火现象",
|
||||
"date": "2025-06-19 11:00",
|
||||
"tags": ["潮玩","labubu"……],
|
||||
"where": "杭州市西湖风景名胜区",
|
||||
"open": "yes",
|
||||
"content": ["paragraph1","paragraph2","paragraph3"……],
|
||||
"images":["image1","image2","image3"……],
|
||||
……
|
||||
}
|
||||
}
|
||||
2. 只解析标识<!--xhs-->的段落(直到空行)。
|
||||
3. 文字内容按段放到xhsdata的content[]中的元素里。
|
||||
4. ![](path/xxx)、![alt](path/yyy)、![[path/zzz]]为图片,解析后放到xhsdata的images[]中,如下["path/xxx","path/yyy","path/zzz"]
|
||||
|
||||
元数据中:
|
||||
image路径中去掉/img/或img/;
|
||||
tags、xhstags、categories格式为[游玩,生活]或者
|
||||
- 游玩
|
||||
- 生活
|
||||
解析为tags[]和categories[]
|
||||
'''
|
||||
335
backup/parse_markdown_file.py
Normal file
335
backup/parse_markdown_file.py
Normal file
@@ -0,0 +1,335 @@
|
||||
|
||||
#########################################################
|
||||
## @file : parse_markdown_file.py
|
||||
## @desc : parse hugo markdown file
|
||||
## @create : 2025/6/22
|
||||
## @author : Chengan,doubao AI
|
||||
## @email : douboer@gmail.com
|
||||
#########################################################
|
||||
|
||||
import logging
|
||||
import json
|
||||
import re
|
||||
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Logging configuration.
def setup_logger(log_file=None):
    """Return the ``markdown_parser`` logger with console and optional file output.

    The logger is a process-wide singleton, so this function is safe to call
    repeatedly: a handler is attached only when an equivalent one is missing.
    (Attaching unconditionally makes every record print once per call.)

    Args:
        log_file: Optional log file path. When given, a DEBUG-level file
            handler for that path is attached.

    Returns:
        The configured ``logging.Logger``.
    """
    import os  # local import: not part of this module's import block

    logger = logging.getLogger('markdown_parser')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # File handler (only when a log file was requested).
    if log_file:
        target = os.path.abspath(os.fspath(log_file))
        already_open = any(
            isinstance(h, logging.FileHandler) and h.baseFilename == target
            for h in logger.handlers
        )
        if not already_open:
            # utf-8 so non-ASCII messages are written regardless of locale.
            file_handler = logging.FileHandler(target, encoding='utf-8')
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

    # Console handler. FileHandler subclasses StreamHandler, hence the exact
    # type check instead of isinstance().
    if not any(type(h) is logging.StreamHandler for h in logger.handlers):
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # The logger's own level filters before any handler runs, so it must admit
    # DEBUG whenever a DEBUG file handler is attached; the console handler
    # still filters at INFO.
    has_file = any(isinstance(h, logging.FileHandler) for h in logger.handlers)
    logger.setLevel(logging.DEBUG if has_file else logging.INFO)

    return logger
|
||||
|
||||
class MarkdownParser:
    """Parse Hugo-style Markdown files and render the extracted data as HTML.

    Only text inside ``<!--xhs-->`` sections is extracted. A section starts at
    a marker line and ends at the next blank line.
    """

    _MD_IMAGE_RE = re.compile(r'!\[.*?\]\((.*?)\)')
    _WIKI_IMAGE_RE = re.compile(r'!\[\[(.*?)\]\]')
    _IMG_PREFIX_RE = re.compile(r'^(/?img/)')

    def __init__(self, log_file=None):
        """Create a parser that logs through ``setup_logger(log_file)``."""
        self.logger = setup_logger(log_file)
        self.logger.info("MarkdownParser initialized")

    def parse_markdown_file(self, file_path):
        """Parse one Markdown file: front-matter metadata plus XHS content.

        Args:
            file_path: Path of the file (str or path-like).

        Returns:
            ``defaultdict(dict)`` with a single entry keyed by the file name.
            The entry holds the metadata keys plus ``content``, ``images`` and
            ``tables``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If ``file_path`` exists but is not a regular file.
        """
        xhsdata = defaultdict(dict)
        file_path = Path(file_path)

        # Validation failures are logged once here, outside the try block, so
        # they are not logged a second time by the handler below.
        if not file_path.exists():
            self.logger.error(f"文件不存在: {file_path}")
            raise FileNotFoundError(f"文件不存在: {file_path}")
        if not file_path.is_file():
            self.logger.error(f"不是有效的文件: {file_path}")
            raise ValueError(f"不是有效的文件: {file_path}")

        filename = file_path.name
        self.logger.info(f"开始解析文件: {filename}")

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            metadata = self._parse_metadata(lines)
            self._process_tags_categories(metadata, xhsdata, filename)
            self._process_image_path(metadata, xhsdata, filename)

            # Copy the remaining metadata without overwriting processed fields.
            for key, value in metadata.items():
                if key not in xhsdata[filename]:
                    xhsdata[filename][key] = value

            self._parse_content_images(lines, xhsdata, filename)
        except Exception:
            # Single logging point for read/parse failures; the helpers do not
            # log on their own, so each traceback appears once.
            self.logger.exception(f"解析文件时发生错误: {file_path}")
            raise

        self.logger.info(f"文件解析完成: {filename}")
        return xhsdata

    def _parse_metadata(self, lines):
        """Parse the front matter between the first two ``---`` lines.

        Supports ``key: value`` pairs and ``key:`` followed by ``- item``
        lines. Values are kept as raw strings; lists become ``list[str]``.

        Returns:
            dict mapping each key to a string or a list of strings.
        """
        metadata = {}
        current_key = None
        current_list = []
        in_front_matter = False

        for raw in lines:
            stripped = raw.strip()
            if stripped == '---':
                if in_front_matter:
                    break
                in_front_matter = True
                continue
            if not in_front_matter:
                continue

            # List item under the most recent "key:" line. Matching on the
            # stripped line makes this independent of the indentation width
            # (the old prefix test and fixed slice disagreed on the width).
            if stripped == '-' or stripped.startswith('- '):
                item = stripped[1:].strip()
                if current_key and item:
                    current_list.append(item)
                continue

            line = raw.rstrip('\n')
            if ':' in line:
                # A new key closes any list being collected.
                if current_key and current_list:
                    metadata[current_key] = current_list
                    current_list = []

                key, _, value = line.partition(':')
                key = key.strip()
                value = value.strip()

                if value == '':
                    # "key:" with nothing after it starts a multi-line list.
                    current_key = key
                    current_list = []
                else:
                    metadata[key] = value
                    current_key = None

        if current_key and current_list:
            metadata[current_key] = current_list

        self.logger.debug(f"解析元数据完成: {metadata}")
        return metadata

    def _process_tags_categories(self, metadata, xhsdata, filename):
        """Store ``tags``, ``xhstags`` and ``categories`` as lists in ``xhsdata[filename]``."""
        for field in ('tags', 'xhstags', 'categories'):
            if field not in metadata:
                continue
            value = metadata[field]
            if isinstance(value, str):
                if value.startswith('[') and value.endswith(']'):
                    # Inline "[a, b]" form.
                    pieces = value[1:-1].replace('"', '').split(',')
                    value = [tag.strip() for tag in pieces if tag.strip()]
                else:
                    value = [value]
            xhsdata[filename][field] = value
            self.logger.debug(f"处理 {field}: {value}")

    def _process_image_path(self, metadata, xhsdata, filename):
        """Store ``image`` with a leading ``/img/`` or ``img/`` removed."""
        if 'image' in metadata:
            image_path = metadata['image']
            clean_image = self._IMG_PREFIX_RE.sub('', image_path)
            xhsdata[filename]['image'] = clean_image
            self.logger.debug(f"处理图片路径: {image_path} -> {clean_image}")

    def _parse_content_images(self, lines, xhsdata, filename):
        """Fill ``content``, ``images`` and ``tables`` from the ``<!--xhs-->`` sections.

        Within a section:

        * a heading line (starting with ``#``) becomes its own content entry,
          and the lines after it are ignored until the section ends or a new
          marker appears;
        * lines starting with ``|`` are collected as table rows (lists of cell
          strings) rather than paragraph text;
        * other lines have image markup removed and are joined into a paragraph.
        """
        entry = xhsdata[filename]
        content = entry['content'] = []
        images = entry['images'] = []
        tables = entry['tables'] = []

        paragraph = []
        table_rows = []
        in_xhs_section = False
        after_heading = False

        def flush_paragraph():
            if paragraph:
                content.append(' '.join(paragraph))
                paragraph.clear()
                self.logger.debug(f"添加段落: {content[-1][:30]}...")

        def flush_table():
            if table_rows:
                tables.append(list(table_rows))
                table_rows.clear()

        for line in lines:
            stripped = line.strip()

            if stripped == '<!--xhs-->':
                # A new marker closes the paragraph being collected.
                flush_paragraph()
                in_xhs_section = True
                after_heading = False
                continue

            if not in_xhs_section:
                continue

            if stripped == '':
                # A blank line ends the section.
                flush_paragraph()
                flush_table()
                in_xhs_section = False
                after_heading = False
                continue

            if stripped.startswith('#'):
                flush_paragraph()
                content.append(stripped)
                self.logger.debug(f"添加标题: {stripped[:30]}...")
                after_heading = True
                continue

            if after_heading:
                # Text following a heading is only extracted when it carries
                # its own <!--xhs--> marker.
                self.logger.debug(f"忽略标题后的内容: {stripped[:30]}...")
                continue

            found = self._MD_IMAGE_RE.findall(line) + self._WIKI_IMAGE_RE.findall(line)
            for match in found:
                clean_match = self._IMG_PREFIX_RE.sub('', match)
                images.append(clean_match)
                self.logger.debug(f"提取图片: {clean_match}")

            # Text left once image markup is removed.
            text = self._MD_IMAGE_RE.sub('', line).strip()
            text = self._WIKI_IMAGE_RE.sub('', text).strip()

            if stripped.startswith('|'):
                table_rows.append([cell.strip() for cell in stripped.strip('|').split('|')])
            else:
                flush_table()
                if text:
                    paragraph.append(text)

        # File ended while a section was still open.
        if paragraph:
            content.append(' '.join(paragraph))
            self.logger.debug(f"添加最后一个段落: {content[-1][:30]}...")
        flush_table()

    def render_to_html(self, xhsdata):
        """Render parsed data as an HTML fragment.

        Output order per file: ``xhstitle`` as ``<h1>``, content entries
        (headings as ``<hN>``, everything else as ``<p>``), tables, three
        ``<br>``, one ``#tag`` line per ``xhstags`` entry, then ``xhssign``.

        All text is HTML-escaped so that ``<``, ``>`` and ``&`` in the source
        cannot break the markup.

        Returns:
            The HTML as one newline-joined string.
        """
        from html import escape  # local import: not in this module's import block

        html_parts = []

        for data in xhsdata.values():
            if 'xhstitle' in data:
                html_parts.append(
                    f'<h1 style="font-size: 24px;">{escape(str(data["xhstitle"]))}</h1>'
                )

            for item in data.get('content', []):
                if item.startswith('#'):
                    # Level = number of leading '#'. HTML defines h1..h6 only.
                    hashes = len(item) - len(item.lstrip('#'))
                    level = min(hashes, 6)
                    title_text = escape(item[hashes:].strip())
                    font_size = 24 - (level - 1) * 2
                    html_parts.append(
                        f'<h{level} style="font-size: {font_size}px;">{title_text}</h{level}>'
                    )
                else:
                    html_parts.append(f'<p style="font-size: 16px;">{escape(item)}</p>')

            for table in data.get('tables', []):
                html_parts.append('<table border="1">')
                for row in table:
                    html_parts.append('<tr>')
                    for cell in row:
                        html_parts.append(f'<td style="font-size: 14px;">{escape(cell)}</td>')
                    html_parts.append('</tr>')
                html_parts.append('</table>')

            # Three blank lines between the content and the tags.
            html_parts.extend(['<br>'] * 3)

            # One tag per line.
            if 'xhstags' in data:
                for tag in data['xhstags']:
                    html_parts.append(
                        f'<span style="font-size: 14px;">#{escape(str(tag))}</span><br>'
                    )

            if 'xhssign' in data:
                html_parts.append(
                    f'<br><br><br><span style="font-size: 14px;">{escape(str(data["xhssign"]))}</span><br>'
                )

        return '\n'.join(html_parts)
|
||||
|
||||
# Example usage.
if __name__ == "__main__":
    parser = MarkdownParser(log_file='markdown_parser.log')
    try:
        file_path = 'markdown/test.md'
        result = parser.parse_markdown_file(file_path)

        print(json.dumps(result, indent=2, ensure_ascii=False))

        html_output = parser.render_to_html(result)
        # Save as GBK. Characters GBK cannot encode (emoji, for example) are
        # written as numeric character references, which HTML renders
        # correctly, instead of aborting the write with UnicodeEncodeError.
        with open('test.html', 'w', encoding='gbk', errors='xmlcharrefreplace') as f:
            f.write(html_output)

    except Exception as e:
        # Top-level boundary: the parser has already logged the traceback.
        print(f"程序运行出错: {e}")
|
||||
|
||||
242
backup/parse_markdown_file.py.bk
Normal file
242
backup/parse_markdown_file.py.bk
Normal file
@@ -0,0 +1,242 @@
|
||||
|
||||
#########################################################
|
||||
## @file : parse_markdown_file.py
|
||||
## @desc : parse hugo markdown file
|
||||
## @create : 2025/6/22
|
||||
## @author : Chengan,doubao AI
|
||||
## @email : douboer@gmail.com
|
||||
#########################################################
|
||||
|
||||
|
||||
import logging
|
||||
import json
|
||||
from collections import defaultdict
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
# Logging configuration.
def setup_logger(log_file=None):
    """Return the ``markdown_parser`` logger with console and optional file output.

    The logger is a process-wide singleton, so this function is safe to call
    repeatedly: a handler is attached only when an equivalent one is missing.
    (Attaching unconditionally makes every record print once per call.)

    Args:
        log_file: Optional log file path. When given, a DEBUG-level file
            handler for that path is attached.

    Returns:
        The configured ``logging.Logger``.
    """
    import os  # local import: not part of this module's import block

    logger = logging.getLogger('markdown_parser')
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

    # File handler (only when a log file was requested).
    if log_file:
        target = os.path.abspath(os.fspath(log_file))
        file_handlers = [h for h in logger.handlers if isinstance(h, logging.FileHandler)]
        if all(h.baseFilename != target for h in file_handlers):
            # utf-8 so non-ASCII messages are written regardless of locale.
            file_handler = logging.FileHandler(target, encoding='utf-8')
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)

    # Console handler. FileHandler subclasses StreamHandler, hence the exact
    # type comparison instead of isinstance().
    console_present = any(type(h) is logging.StreamHandler for h in logger.handlers)
    if not console_present:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    # The logger's own level filters before any handler runs, so it must admit
    # DEBUG whenever a DEBUG file handler is attached; the console handler
    # still filters at INFO.
    if any(isinstance(h, logging.FileHandler) for h in logger.handlers):
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.INFO)

    return logger
|
||||
|
||||
class MarkdownParser:
    """Parse Hugo-style Markdown files: front matter plus ``<!--xhs-->`` sections.

    A section starts at a marker line and ends at the next blank line.
    """

    _MD_IMAGE_RE = re.compile(r'!\[.*?\]\((.*?)\)')
    _WIKI_IMAGE_RE = re.compile(r'!\[\[(.*?)\]\]')
    _IMG_PREFIX_RE = re.compile(r'^(/?img/)')

    def __init__(self, log_file=None):
        """Create a parser that logs through ``setup_logger(log_file)``."""
        self.logger = setup_logger(log_file)
        self.logger.info("MarkdownParser initialized")

    def parse_markdown_file(self, file_path):
        """Parse one Markdown file: front-matter metadata plus XHS content.

        Args:
            file_path: Path of the file (str or path-like).

        Returns:
            ``defaultdict(dict)`` with a single entry keyed by the file name.
            The entry holds the metadata keys plus ``content`` and ``images``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If ``file_path`` exists but is not a regular file.
        """
        xhsdata = defaultdict(dict)
        file_path = Path(file_path)

        # Validation failures are logged once here, outside the try block, so
        # they are not logged a second time by the handler below.
        if not file_path.exists():
            self.logger.error(f"文件不存在: {file_path}")
            raise FileNotFoundError(f"文件不存在: {file_path}")
        if not file_path.is_file():
            self.logger.error(f"不是有效的文件: {file_path}")
            raise ValueError(f"不是有效的文件: {file_path}")

        filename = file_path.name
        self.logger.info(f"开始解析文件: {filename}")

        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                lines = file.readlines()

            metadata = self._parse_metadata(lines)
            self._process_tags_categories(metadata, xhsdata, filename)
            self._process_image_path(metadata, xhsdata, filename)

            # Copy the remaining metadata without overwriting processed fields.
            for key, value in metadata.items():
                if key not in xhsdata[filename]:
                    xhsdata[filename][key] = value

            self._parse_content_images(lines, xhsdata, filename)
        except Exception:
            # Single logging point for read/parse failures; the helpers do not
            # log on their own, so each traceback appears once.
            self.logger.exception(f"解析文件时发生错误: {file_path}")
            raise

        self.logger.info(f"文件解析完成: {filename}")
        return xhsdata

    def _parse_metadata(self, lines):
        """Parse the front matter between the first two ``---`` lines.

        Supports ``key: value`` pairs and ``key:`` followed by ``- item``
        lines. Values are kept as raw strings; lists become ``list[str]``.

        Returns:
            dict mapping each key to a string or a list of strings.
        """
        metadata = {}
        current_key = None
        current_list = []
        in_front_matter = False

        for raw in lines:
            stripped = raw.strip()
            if stripped == '---':
                if in_front_matter:
                    break
                in_front_matter = True
                continue
            if not in_front_matter:
                continue

            # List item under the most recent "key:" line. Matching on the
            # stripped line makes this independent of the indentation width
            # (the old prefix test and fixed slice disagreed on the width).
            if stripped == '-' or stripped.startswith('- '):
                item = stripped[1:].strip()
                if current_key and item:
                    current_list.append(item)
                continue

            line = raw.rstrip('\n')
            if ':' in line:
                # A new key closes any list being collected.
                if current_key and current_list:
                    metadata[current_key] = current_list
                    current_list = []

                key, _, value = line.partition(':')
                key = key.strip()
                value = value.strip()

                if value == '':
                    # "key:" with nothing after it starts a multi-line list.
                    current_key = key
                    current_list = []
                else:
                    metadata[key] = value
                    current_key = None

        if current_key and current_list:
            metadata[current_key] = current_list

        self.logger.debug(f"解析元数据完成: {metadata}")
        return metadata

    def _process_tags_categories(self, metadata, xhsdata, filename):
        """Store ``tags``, ``xhstags`` and ``categories`` as lists in ``xhsdata[filename]``."""
        for field in ('tags', 'xhstags', 'categories'):
            if field not in metadata:
                continue
            value = metadata[field]
            if isinstance(value, str):
                if value.startswith('[') and value.endswith(']'):
                    # Inline "[a, b]" form.
                    pieces = value[1:-1].replace('"', '').split(',')
                    value = [tag.strip() for tag in pieces if tag.strip()]
                else:
                    value = [value]
            xhsdata[filename][field] = value
            self.logger.debug(f"处理 {field}: {value}")

    def _process_image_path(self, metadata, xhsdata, filename):
        """Store ``image`` with a leading ``/img/`` or ``img/`` removed."""
        if 'image' in metadata:
            image_path = metadata['image']
            clean_image = self._IMG_PREFIX_RE.sub('', image_path)
            xhsdata[filename]['image'] = clean_image
            self.logger.debug(f"处理图片路径: {image_path} -> {clean_image}")

    def _parse_content_images(self, lines, xhsdata, filename):
        """Fill ``content`` and ``images`` from the ``<!--xhs-->`` sections.

        Each section's lines have image markup removed and are joined with
        spaces into one paragraph. A new marker inside an open section closes
        the current paragraph and starts another.
        """
        entry = xhsdata[filename]
        content = entry['content'] = []
        images = entry['images'] = []

        paragraph = []
        in_xhs_section = False

        def flush_paragraph():
            if paragraph:
                content.append(' '.join(paragraph))
                paragraph.clear()
                self.logger.debug(f"添加段落: {content[-1][:30]}...")

        for line in lines:
            stripped = line.strip()

            if stripped == '<!--xhs-->':
                flush_paragraph()
                in_xhs_section = True
                continue

            if not in_xhs_section:
                continue

            if stripped == '':
                # A blank line ends the section.
                flush_paragraph()
                in_xhs_section = False
                continue

            found = self._MD_IMAGE_RE.findall(line) + self._WIKI_IMAGE_RE.findall(line)
            for match in found:
                clean_match = self._IMG_PREFIX_RE.sub('', match)
                images.append(clean_match)
                self.logger.debug(f"提取图片: {clean_match}")

            # Text left once image markup is removed.
            text = self._MD_IMAGE_RE.sub('', line).strip()
            text = self._WIKI_IMAGE_RE.sub('', text).strip()
            if text:
                paragraph.append(text)

        # File ended while a section was still open.
        if paragraph:
            content.append(' '.join(paragraph))
            self.logger.debug(f"添加最后一个段落: {content[-1][:30]}...")
|
||||
|
||||
# Example usage.
if __name__ == "__main__":
    parser = MarkdownParser(log_file='markdown_parser.log')
    try:
        file_path = 'markdown/test.md'
        result = parser.parse_markdown_file(file_path)
        print(json.dumps(result, indent=2, ensure_ascii=False))
    except Exception as e:
        # Top-level boundary: the parser has already logged the traceback.
        print(f"程序运行出错: {e}")
|
||||
|
||||
|
||||
141
backup/parse_markdown_file.py.bk2
Normal file
141
backup/parse_markdown_file.py.bk2
Normal file
@@ -0,0 +1,141 @@
|
||||
#########################################################
|
||||
## @file : parse_markdown_file.py
|
||||
## @desc : parse hugo markdown file
|
||||
## @create : 2025/6/22
|
||||
## @author : Chengan,doubao AI
|
||||
## @email : douboer@gmail.com
|
||||
#########################################################
|
||||
|
||||
from collections import defaultdict
|
||||
import re
|
||||
import json
|
||||
|
||||
def parse_markdown_file(file_path):
    """Parse a Hugo-style Markdown file into ``{filename: {...}}``.

    The result entry holds the front-matter keys plus:

    * ``content`` -- one string per paragraph found in a ``<!--xhs-->`` section.
      A section runs from the marker line to the next blank line; a marker
      inside an open section closes the current paragraph and starts another.
    * ``images``  -- image paths found in those sections, from ``![alt](path)``
      and ``![[path]]``, with a leading ``/`` and/or ``img/`` removed.

    ``tags``, ``xhstags`` and ``categories`` are always normalised to lists;
    ``image`` has a leading ``/img/`` or ``img/`` removed.

    Args:
        file_path: Path of the Markdown file (str or path-like).

    Returns:
        ``defaultdict(dict)`` keyed by the file's base name.

    Raises:
        OSError: If the file cannot be opened.
    """
    import os  # local import: not part of this module's import block

    md_image = re.compile(r'!\[.*?\]\((.*?)\)')
    wiki_image = re.compile(r'!\[\[(.*?)\]\]')
    front_image_prefix = re.compile(r'^(/?img/)')
    # Optional leading slash, then optional "img/": covers "/img/x", "img/x"
    # and "/x" (the old pattern left "img/" behind on "/img/x").
    body_image_prefix = re.compile(r'^/?(img/)?')

    xhsdata = defaultdict(dict)
    filename = os.path.basename(os.fspath(file_path))

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # ---- front matter (between the first two '---' lines) ----
    metadata = {}
    pending_key = None
    pending_items = []
    inside_front_matter = False

    for raw in lines:
        stripped = raw.strip()
        if stripped == '---':
            if inside_front_matter:
                break
            inside_front_matter = True
            continue
        if not inside_front_matter:
            continue

        # List item under the most recent "key:" line. Matching on the
        # stripped line makes this independent of the indentation width.
        if stripped == '-' or stripped.startswith('- '):
            item = stripped[1:].strip()
            if pending_key and item:
                pending_items.append(item)
            continue

        line = raw.rstrip('\n')
        if ':' not in line:
            continue

        # A new key closes any list being collected.
        if pending_key and pending_items:
            metadata[pending_key] = pending_items
            pending_items = []

        key, _, value = line.partition(':')
        key = key.strip()
        value = value.strip()

        if value:
            metadata[key] = value
            pending_key = None
        else:
            # "key:" with nothing after it starts a multi-line list.
            pending_key = key
            pending_items = []

    if pending_key and pending_items:
        metadata[pending_key] = pending_items

    entry = xhsdata[filename]

    # ---- tags / categories: always lists ----
    for field in ('tags', 'xhstags', 'categories'):
        if field not in metadata:
            continue
        value = metadata[field]
        if isinstance(value, str):
            if value.startswith('[') and value.endswith(']'):
                # Inline "[a, b]" form.
                pieces = value[1:-1].replace('"', '').split(',')
                value = [tag.strip() for tag in pieces if tag.strip()]
            else:
                value = [value]
        entry[field] = value

    # ---- cover image: drop a leading /img/ or img/ ----
    if 'image' in metadata:
        entry['image'] = front_image_prefix.sub('', metadata['image'])

    # ---- remaining metadata, without overwriting processed fields ----
    for key, value in metadata.items():
        if key not in entry:
            entry[key] = value

    # ---- <!--xhs--> sections ----
    content = entry['content'] = []
    images = entry['images'] = []
    in_xhs_section = False
    paragraph = []

    for raw in lines:
        stripped = raw.strip()

        if stripped == '<!--xhs-->':
            # A new marker closes the paragraph being collected.
            if paragraph and in_xhs_section:
                content.append(' '.join(paragraph))
                paragraph = []
            in_xhs_section = True
            continue

        if not in_xhs_section:
            continue

        if stripped == '':
            # A blank line ends the section and its paragraph.
            if paragraph:
                content.append(' '.join(paragraph))
                paragraph = []
            in_xhs_section = False
            continue

        found = md_image.findall(raw) + wiki_image.findall(raw)
        images.extend(body_image_prefix.sub('', match) for match in found)

        # Keep whatever text remains once image markup is removed.
        text = md_image.sub('', raw).strip()
        text = wiki_image.sub('', text).strip()
        if text:
            paragraph.append(text)

    # File ended while a section was still open.
    if paragraph:
        content.append(' '.join(paragraph))

    return xhsdata
|
||||
|
||||
# Example run. Guarded so that importing this module does not read
# 'markdown/test.md' (and fail when it is missing) as a side effect.
if __name__ == "__main__":
    # Change this path to the file to parse.
    file_path = 'markdown/test.md'
    result = parse_markdown_file(file_path)
    print(json.dumps(result, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
237
backup/pretty_html_table.py
Normal file
237
backup/pretty_html_table.py
Normal file
@@ -0,0 +1,237 @@
|
||||
import io
|
||||
|
||||
# Colour themes as a dict of 4-tuples, in the order build_table() unpacks them:
#   (header text colour, header border-bottom, odd-row background, header background)
dict_colors = {
    'yellow_light': ('#BF8F00', '2px solid #BF8F00', '#FFF2CC', '#FFFFFF'),
    'grey_light': ('#808080', '2px solid #808080', '#EDEDED', '#FFFFFF'),
    'blue_light': ('#305496', '2px solid #305496', '#D9E1F2', '#FFFFFF'),
    'orange_light': ('#C65911', '2px solid #C65911', '#FCE4D6', '#FFFFFF'),
    'green_light': ('#548235', '2px solid #548235', '#E2EFDA', '#FFFFFF'),
    'red_light': ('#823535', '2px solid #823535', '#efdada', '#FFFFFF'),
    'yellow_dark': ('#FFFFFF', '2px solid #BF8F00', '#FFF2CC', '#BF8F00'),
    'grey_dark': ('#FFFFFF', '2px solid #808080', '#EDEDED', '#808080'),
    'blue_dark': ('#FFFFFF', '2px solid #305496', '#D9E1F2', '#305496'),
    'orange_dark': ('#FFFFFF', '2px solid #C65911', '#FCE4D6', '#C65911'),
    'green_dark': ('#FFFFFF', '2px solid #548235', '#E2EFDA', '#548235'),
    'red_dark': ('#FFFFFF', '2px solid #823535', '#efdada', '#823535'),
}
|
||||
|
||||
|
||||
def build_table(
|
||||
df,
|
||||
color,
|
||||
font_size='medium',
|
||||
font_family='Century Gothic, sans-serif',
|
||||
text_align='left',
|
||||
width='auto',
|
||||
index=False,
|
||||
even_color='black',
|
||||
even_bg_color='white',
|
||||
odd_bg_color=None,
|
||||
border_bottom_color=None,
|
||||
escape=True,
|
||||
width_dict=[],
|
||||
padding="0px 20px 0px 0px",
|
||||
float_format=None,
|
||||
conditions={}):
|
||||
|
||||
if df.empty:
|
||||
return ''
|
||||
|
||||
# Set color
|
||||
color, border_bottom, odd_background_color, header_background_color = dict_colors[color]
|
||||
|
||||
if odd_bg_color:
|
||||
odd_background_color = odd_bg_color
|
||||
|
||||
if border_bottom_color:
|
||||
border_bottom = border_bottom_color
|
||||
|
||||
a = 0
|
||||
while a != len(df):
|
||||
if a == 0:
|
||||
df_html_output = df.iloc[[a]].to_html(
|
||||
na_rep="",
|
||||
index=index,
|
||||
border=0,
|
||||
escape=escape,
|
||||
float_format=float_format,
|
||||
)
|
||||
# change format of header
|
||||
if index:
|
||||
df_html_output = df_html_output.replace('<th>'
|
||||
,'<th style = "background-color: ' + header_background_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';color: ' + color
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';border-bottom: ' + border_bottom
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">', len(df.columns)+1)
|
||||
|
||||
df_html_output = df_html_output.replace('<th>'
|
||||
,'<th style = "background-color: ' + odd_background_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">')
|
||||
|
||||
else:
|
||||
df_html_output = df_html_output.replace('<th>'
|
||||
,'<th style = "background-color: ' + header_background_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';color: ' + color
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';border-bottom: ' + border_bottom
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">')
|
||||
|
||||
#change format of table
|
||||
df_html_output = df_html_output.replace('<td>'
|
||||
,'<td style = "background-color: ' + odd_background_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">')
|
||||
body = """<p>""" + format(df_html_output)
|
||||
|
||||
a = 1
|
||||
|
||||
elif a % 2 == 0:
|
||||
df_html_output = df.iloc[[a]].to_html(na_rep = "", index = index, header = False, escape=escape)
|
||||
|
||||
# change format of index
|
||||
df_html_output = df_html_output.replace('<th>'
|
||||
,'<th style = "background-color: ' + odd_background_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">')
|
||||
|
||||
#change format of table
|
||||
df_html_output = df_html_output.replace('<td>'
|
||||
,'<td style = "background-color: ' + odd_background_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">')
|
||||
|
||||
body = body + format(df_html_output)
|
||||
|
||||
a += 1
|
||||
|
||||
elif a % 2 != 0:
|
||||
df_html_output = df.iloc[[a]].to_html(na_rep = "", index = index, header = False, escape=escape)
|
||||
|
||||
# change format of index
|
||||
df_html_output = df_html_output.replace('<th>'
|
||||
,'<th style = "background-color: ' + even_bg_color
|
||||
+ '; color: ' + even_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">')
|
||||
|
||||
#change format of table
|
||||
df_html_output = df_html_output.replace('<td>'
|
||||
,'<td style = "background-color: ' + even_bg_color
|
||||
+ '; color: ' + even_color
|
||||
+ ';font-family: ' + font_family
|
||||
+ ';font-size: ' + str(font_size)
|
||||
+ ';text-align: ' + text_align
|
||||
+ ';padding: ' + padding
|
||||
+ ';width: ' + str(width) + '">')
|
||||
body = body + format(df_html_output)
|
||||
|
||||
a += 1
|
||||
|
||||
body = body + """</p>"""
|
||||
|
||||
body = body.replace("""</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
<table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>""","""</td>
|
||||
</tr>
|
||||
<tr>""").replace("""</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table><table border="1" class="dataframe">
|
||||
<tbody>
|
||||
<tr>""","""</td>
|
||||
</tr>
|
||||
<tr>""")
|
||||
|
||||
if conditions:
|
||||
for k in conditions.keys():
|
||||
try:
|
||||
conditions[k]['index'] = list(df.columns).index(k)
|
||||
width_body = ''
|
||||
w = 0
|
||||
for line in io.StringIO(body):
|
||||
updated_body = False
|
||||
if w == conditions[k]['index']:
|
||||
try:
|
||||
if int(repr(line).split('>')[1].split('<')[0]) < conditions[k]['min']:
|
||||
if 'color: black' in repr(line):
|
||||
width_body = width_body + repr(line).replace("color: black", 'color: ' + conditions[k]['min_color'])[1:]
|
||||
elif 'color: white' in repr(line):
|
||||
width_body = width_body + repr(line).replace("color: white", 'color: ' + conditions[k]['min_color'])[1:]
|
||||
else:
|
||||
width_body = width_body + repr(line).replace('">', '; color: ' + conditions[k]['min_color'] + '">')[1:]
|
||||
updated_body = True
|
||||
elif int(repr(line).split('>')[1].split('<')[0]) > conditions[k]['max']:
|
||||
if 'color: black' in repr(line):
|
||||
width_body = width_body + repr(line).replace("color: black", 'color: ' + conditions[k]['max_color'])[1:]
|
||||
elif 'color: white' in repr(line):
|
||||
width_body = width_body + repr(line).replace("color: white", 'color: ' + conditions[k]['max_color'])[1:]
|
||||
else:
|
||||
width_body = width_body + repr(line).replace('">', '; color: ' + conditions[k]['max_color'] + '">')[1:]
|
||||
updated_body = True
|
||||
except:
|
||||
pass
|
||||
if not updated_body:
|
||||
width_body = width_body + repr(line)[1:]
|
||||
|
||||
if str(repr(line))[:10] == "' <td" or str(repr(line))[:10] == "' <th":
|
||||
if w == len(df.columns) -1:
|
||||
w = 0
|
||||
else:
|
||||
w += 1
|
||||
body = width_body[:len(width_body)-1]
|
||||
except:
|
||||
pass
|
||||
|
||||
if len(width_dict) == len(df.columns):
|
||||
width_body = ''
|
||||
w = 0
|
||||
if conditions:
|
||||
for line in body.split(r"\n'"):
|
||||
line = line.replace("\n", "")
|
||||
width_body = width_body + repr(line).replace("width: auto", 'width: ' + width_dict[w])[1:]
|
||||
if str(repr(line))[:10] == "' <td" or str(repr(line))[:10] == "' <th" :
|
||||
if w == len(df.columns) -1:
|
||||
w = 0
|
||||
else:
|
||||
w += 1
|
||||
else:
|
||||
for line in io.StringIO(body):
|
||||
line = line.replace("\n", "")
|
||||
width_body = width_body + repr(line).replace("width: auto", 'width: ' + width_dict[w])[1:]
|
||||
if str(repr(line))[:10] == "' <td" or str(repr(line))[:10] == "' <th" :
|
||||
if w == len(df.columns) -1:
|
||||
w = 0
|
||||
else:
|
||||
w += 1
|
||||
return width_body[:len(width_body)-1].replace("'", "")
|
||||
else:
|
||||
return body.replace(r"\n'", "")
|
||||
43
backup/push_files.py
Normal file
43
backup/push_files.py
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
import subprocess
|
||||
|
||||
def get_uncommitted_changes():
    """Collect all uncommitted paths, grouped as added, deleted or modified.

    Runs ``git status --porcelain`` in the current working directory and
    classifies each entry by its two-character status code:

    * ``??`` (untracked) and ``A`` (staged new file) -> ``'added'``
    * ``D`` (deleted in the index or in the work tree) -> ``'deleted'``
    * every other code (``M``, ``MM``, ``AM``, ``MD``, renames, copies,
      type changes, unmerged entries, ...) -> ``'modified'``

    Returns:
        dict: ``{'added': [...], 'deleted': [...], 'modified': [...]}`` with
        the paths as printed by git. All lists are empty when git fails
        (e.g. not inside a repository) or is not installed.
    """
    changes = {
        'added': [],
        'deleted': [],
        'modified': []
    }

    try:
        # Full state of both the work tree and the index.
        status_output = subprocess.check_output(
            ["git", "status", "--porcelain"],
            text=True,
            stderr=subprocess.DEVNULL
        ).splitlines()
    except (subprocess.CalledProcessError, OSError):
        # Best effort: a failing or missing git yields an empty report.
        return changes

    for line in status_output:
        status_code = line[:2].strip()
        file_path = line[3:].strip()
        if not status_code or not file_path or status_code == '!!':
            continue

        # Renames/copies are printed as "old -> new"; report the new path.
        if status_code[0] in ('R', 'C') and ' -> ' in file_path:
            file_path = file_path.split(' -> ', 1)[1]

        if status_code in ('??', 'A'):
            changes['added'].append(file_path)
        elif status_code == 'D':
            changes['deleted'].append(file_path)
        else:
            changes['modified'].append(file_path)

    return changes
|
||||
|
||||
# Usage example: print every non-empty change group under an upper-case heading.
changes = get_uncommitted_changes()
for status in changes:
    files = changes[status]
    if not files:
        continue
    print(f"\n{status.upper()}:")
    print("\n".join(f" - {file}" for file in files))
|
||||
|
||||
326
backup/txt_to_image.py
Normal file
326
backup/txt_to_image.py
Normal file
@@ -0,0 +1,326 @@
|
||||
#########################################################
|
||||
## @file : txt_to_image.py
|
||||
## @desc : convert text content into images
|
||||
## @create : 2025/6/22
|
||||
## @author : Chengan,doubao AI
|
||||
## @email : douboer@gmail.com
|
||||
#########################################################
|
||||
|
||||
from logger_utils import CommonLogger
|
||||
from PIL import Image, ImageDraw, ImageFont, ImageFilter
|
||||
import textwrap
|
||||
import random
|
||||
import os
|
||||
from config import fonts_path, templates, styles
|
||||
|
||||
class TextToImage:
    """Convert text content into laid-out images.

    A title, an optional subtitle, the body paragraphs and an optional
    signature are drawn onto a template background. When the body does not
    fit on one canvas, further pages are started and saved as separate files.
    """

    def __init__(self, log_file=None):
        """Set up the logger, the font/template tables and the canvas size."""
        self.logger = CommonLogger(log_file).get_logger()
        self.fonts_path = fonts_path
        self.templates = templates
        self.target_size = (1244, 1660)  # target canvas size (width, height)

    def create_image(self, title, content, subtitle=None, signature=None,
                     template='minimal',
                     output_path='output.png'):
        """Render the text onto one or more pages and save them.

        Args:
            title: Title text, centred at the top of the first page.
            content: Body text; paragraphs are separated by a blank line.
            subtitle: Optional subtitle, centred below the title.
            signature: Optional signature drawn on every page.
            template: Key into the template table.
            output_path: Base output path; ``_<page number>`` is inserted
                before the extension for every page.

        Returns:
            list[str] | None: Paths of the saved pages, or ``None`` when any
            error occurred (the error is logged, not raised).
        """
        try:
            # Make sure the requested template exists.
            if template not in self.templates:
                raise ValueError(f"模板 '{template}' 不存在")
            template_data = self.templates[template]

            # Prepare the background canvas.
            background = self._process_background(template_data['background'])
            draw = ImageDraw.Draw(background)
            width, height = background.size
            padding = template_data['padding']

            # Styles come from config.
            title_style = styles['title']
            subtitle_style = styles['subtitle']
            body_style = styles['body']
            signature_style = styles['signature']
            # These layout keys are optional: fall back to sensible defaults
            # instead of failing with KeyError when config does not define them.
            top_offset = styles.get('top_offset', padding)
            bottom_offset = styles.get('bottom_offset', padding)
            paragraph_spacing = styles.get('paragraph_spacing', body_style['line_spacing'])

            # Check that every configured font file exists.
            for path in self.fonts_path.values():
                if not os.path.exists(path):
                    raise FileNotFoundError(f"字体文件 '{path}' 不存在,请确认已放入fonts目录")

            # Load fonts.
            title_font = ImageFont.truetype(self.fonts_path['title'], title_style['size'])
            subtitle_font = ImageFont.truetype(self.fonts_path['subtitle'], subtitle_style['size'])
            body_font = ImageFont.truetype(self.fonts_path['body'], body_style['size'])
            signature_font = ImageFont.truetype(self.fonts_path['signature'], signature_style['size'])

            # Draw the title, horizontally centred.
            title_x = (width - self._get_text_width(title, title_font, title_style['letter_spacing'])) // 2
            title_y = padding + title_style['top_spacing']
            self._draw_text_with_spacing(draw, (title_x, title_y), title,
                                         title_font, title_style['color'], title_style['letter_spacing'])

            # Title height.
            title_bbox = draw.textbbox((0, 0), title, font=title_font)
            title_height = title_bbox[3] - title_bbox[1]

            # Draw the subtitle, horizontally centred.
            current_y = title_y + title_height + title_style['bottom_spacing']
            if subtitle:
                subtitle_x = (width - self._get_text_width(subtitle, subtitle_font, subtitle_style['letter_spacing'])) // 2
                self._draw_text_with_spacing(draw, (subtitle_x, current_y), subtitle,
                                             subtitle_font, subtitle_style['color'], subtitle_style['letter_spacing'])
                subtitle_bbox = draw.textbbox((0, 0), subtitle, font=subtitle_font)
                subtitle_height = subtitle_bbox[3] - subtitle_bbox[1]
                current_y += subtitle_height + subtitle_style['bottom_spacing']

            # Split the content into paragraphs.
            paragraphs = content.split('\n\n')
            page_num = 1
            output_paths = []
            max_width = width - padding * 2
            for i, paragraph in enumerate(paragraphs):
                if i > 0:  # add paragraph spacing before every paragraph but the first
                    current_y += paragraph_spacing

                paragraph_height = self._get_paragraph_height(paragraph, body_font, body_style, padding)
                if current_y + paragraph_height > height - bottom_offset:
                    # The paragraph does not fit: finish this page ...
                    if signature:
                        self._draw_signature(draw, width, height, padding, signature,
                                             signature_font, signature_style, current_y)
                    output_paths.append(self._save_page(background, output_path, page_num))
                    # ... and start a new one.
                    background = self._process_background(template_data['background'])
                    draw = ImageDraw.Draw(background)
                    current_y = top_offset
                    page_num += 1

                # Draw the paragraph.
                wrapped_text = self._wrap_text(paragraph, body_font, max_width, body_style['letter_spacing'])
                self._draw_multiline_text_with_spacing(
                    draw, (padding, current_y), wrapped_text,
                    body_font, body_style['color'],
                    body_style['letter_spacing'], body_style['line_spacing']
                )

                # Advance by the paragraph height.
                text_bbox = draw.textbbox((0, 0), wrapped_text, font=body_font)
                text_height = (text_bbox[3] - text_bbox[1]) + (body_style['line_spacing'] * (wrapped_text.count('\n') or 1))
                current_y += text_height + body_style['line_spacing']

            # Draw the signature on the last page.
            if signature:
                self._draw_signature(draw, width, height, padding, signature,
                                     signature_font, signature_style, current_y)

            # Save the last page.
            output_paths.append(self._save_page(background, output_path, page_num))

            return output_paths
        except Exception as e:
            self.logger.exception(f"创建图片时发生错误: {e}")
            return None

    @staticmethod
    def _page_path(output_path, page_num):
        """Insert ``_<page_num>`` before the extension of *output_path*.

        Only the real file extension is touched; a missing extension
        defaults to ``.png``.
        """
        root, ext = os.path.splitext(output_path)
        return f"{root}_{page_num}{ext or '.png'}"

    def _save_page(self, image, output_path, page_num):
        """Save one page, creating the output directory if needed; return its path."""
        page_path = self._page_path(output_path, page_num)
        directory = os.path.dirname(page_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        if page_path.lower().endswith(('.jpg', '.jpeg')):
            # JPEG cannot store the alpha channel of the RGBA canvas.
            image = image.convert("RGB")
        image.save(page_path)
        self.logger.info(f"图片已保存至: {page_path}")
        return page_path

    def _get_text_width(self, text, font, letter_spacing):
        """Return the width of *text* including the letter spacing between characters."""
        if not text:
            return 0
        bbox = font.getbbox(text)
        return bbox[2] - bbox[0] + (letter_spacing * (len(text) - 1))

    def _draw_text_with_spacing(self, draw, position, text, font, fill, letter_spacing):
        """Draw *text* character by character, adding *letter_spacing* after each one."""
        x, y = position
        for char in text:
            draw.text((x, y), char, font=font, fill=fill)
            # Advance by the character width plus the letter spacing.
            char_width = font.getlength(char)
            x += char_width + letter_spacing

    def _draw_multiline_text_with_spacing(self, draw, position, text, font, fill,
                                          letter_spacing, line_spacing):
        """Draw newline-separated *text* with letter spacing and line spacing."""
        x, y = position
        lines = text.split('\n')
        for line in lines:
            self._draw_text_with_spacing(draw, (x, y), line, font, fill, letter_spacing)
            # Advance by the measured line height plus the line spacing.
            line_bbox = draw.textbbox((0, 0), line, font=font)
            line_height = line_bbox[3] - line_bbox[1]
            y += line_height + line_spacing

    def _add_decorations(self, draw, width, height, template_data):
        """Scatter eight small decorative dots at random positions inside the padding."""
        padding = template_data['padding']
        color = (30, 30, 30)  # default decoration colour

        for _ in range(8):
            x = random.randint(padding, width - padding)
            y = random.randint(padding, height - padding)
            draw.ellipse([(x - 2, y - 2), (x + 2, y + 2)], fill=color)

    def _wrap_text(self, text, font, max_width, letter_spacing):
        """Wrap *text* character by character so no line exceeds *max_width*.

        Existing newlines are kept; an empty input line contributes two
        newlines. The result is stripped of leading/trailing whitespace.
        """
        if not text:
            return ""

        pieces = []
        for line in text.split('\n'):
            if not line:
                pieces.append("\n\n")
                continue

            current_line = ""
            current_width = 0
            # Wrapping is per character, which suits CJK text without spaces.
            for char in line:
                char_width = font.getlength(char) + letter_spacing
                if current_width + char_width > max_width:
                    if current_line:
                        pieces.append(current_line + "\n")
                        current_line = char
                        current_width = char_width
                    else:
                        # A single character wider than the line gets its own line.
                        pieces.append(char + "\n")
                        current_line = ""
                        current_width = 0
                else:
                    current_line += char
                    current_width += char_width
            if current_line:
                pieces.append(current_line + "\n")

        return "".join(pieces).strip()

    def _process_background(self, background_path):
        """Load the background and fit it to ``self.target_size``.

        The image is scaled to a width of 1440 (keeping the aspect ratio),
        then centre-cropped when it covers the target size, or centred on a
        white canvas otherwise. If anything fails, the error is logged and a
        plain white canvas of the target size is returned.
        """
        target_width, target_height = self.target_size
        try:
            # Open the background; the context manager releases the file
            # handle once the converted copy exists.
            with Image.open(background_path) as source:
                background = source.convert("RGBA")
            bg_width, bg_height = background.size

            # Scale to a width of 1440, keeping the aspect ratio.
            if bg_width != 1440:
                ratio = 1440 / bg_width
                new_height = int(bg_height * ratio)
                background = background.resize((1440, new_height), Image.LANCZOS)

            bg_width, bg_height = background.size

            if bg_width >= target_width and bg_height >= target_height:
                # Large enough in both directions: centre crop.
                left = (bg_width - target_width) // 2
                top = (bg_height - target_height) // 2
                right = left + target_width
                bottom = top + target_height
                background = background.crop((left, top, right, bottom))
            else:
                # Too small in at least one direction: centre on a white canvas.
                new_background = Image.new("RGBA", self.target_size, (255, 255, 255, 255))
                paste_x = (target_width - bg_width) // 2
                paste_y = (target_height - bg_height) // 2
                new_background.paste(background, (paste_x, paste_y))
                background = new_background

        except Exception as e:
            # Fall back to a plain white canvas of the target size.
            self.logger.exception(f"背景图片处理失败: {e}")
            background = Image.new("RGBA", self.target_size, (255, 255, 255, 255))

        return background

    def _get_paragraph_height(self, paragraph, font, style, padding=None):
        """Return the height *paragraph* will occupy once wrapped.

        Args:
            paragraph: Paragraph text.
            font: Font used for the body text.
            style: Body style dict (``letter_spacing``, ``line_spacing``).
            padding: Horizontal padding of the active template. When omitted,
                the padding of the ``'minimal'`` template is used.
        """
        if padding is None:
            padding = self.templates['minimal']['padding']
        max_width = self.target_size[0] - 2 * padding
        wrapped_text = self._wrap_text(paragraph, font, max_width, style['letter_spacing'])
        # A 1x1 scratch image is enough for measuring text.
        scratch = ImageDraw.Draw(Image.new("RGBA", (1, 1)))
        text_bbox = scratch.textbbox((0, 0), wrapped_text, font=font)
        text_height = (text_bbox[3] - text_bbox[1]) + (style['line_spacing'] * (wrapped_text.count('\n') or 1))
        return text_height

    def _draw_signature(self, draw, width, height, padding, signature, signature_font,
                        signature_style, current_y=None):
        """Draw the signature according to ``signature_style``.

        Args:
            draw: Drawing context of the current page.
            width, height: Canvas size.
            padding: Template padding.
            signature: Signature text.
            signature_font: Font for the signature.
            signature_style: Style dict (``position``, ``vertical_position``,
                ``offset``, ``letter_spacing``, ``color``).
            current_y: Y coordinate where the body text ended; used when
                ``vertical_position`` is not ``'bottom'``. When it is not
                given, the signature is anchored at the bottom instead.
        """
        signature_width = self._get_text_width(signature, signature_font, signature_style['letter_spacing'])
        signature_bbox = draw.textbbox((0, 0), signature, font=signature_font)
        signature_height = signature_bbox[3] - signature_bbox[1]

        # Vertical position.
        if signature_style['vertical_position'] == 'bottom' or current_y is None:
            # Anchored at the bottom.
            signature_y = height - padding - signature_height - signature_style['offset']
        else:
            # Flows with the content.
            signature_y = current_y + 40

        # Horizontal position.
        if signature_style['position'] == 'left':
            signature_x = padding
        elif signature_style['position'] == 'center':
            signature_x = (width - signature_width) // 2
        else:  # 'right'
            signature_x = width - padding - signature_width

        self._draw_text_with_spacing(draw, (signature_x, signature_y), signature,
                                     signature_font, signature_style['color'], signature_style['letter_spacing'])
|
||||
|
||||
if __name__ == "__main__":
    # Sample content.
    title = "夏日阅读清单"
    subtitle = "2025年必读书单推荐"
    content = """日本寺院有抽签的风尚,大概也是收入不菲的创收项目吧。一个个签位,标了价格,无人售票,这种方式挺好,没有买卖的功利和压力。

不免俗,抽了支,下下,中间有句,大意是人财分离。虽不作兴这个,还是略有不悦。

逛完突然想着去吃抹茶小点,老杨发现转角处就有一家抹茶专业店,可惜刚打烊。边上小店点了一个撒着浅草字样的冰激凌,无惊喜。

途经一家电玩店,别有洞天,一排排整齐划一,机器挨着机器,每一排机器造型不同,应该超过500台,空间被利用到极致。充斥着机器电音,令人想起工业朋
克。一簇簇坐着的玩家,电子烟是标配,60%是中老年人,秃头大爷和白发老奶不少见,并无少年,颠覆我对游戏厅的印象。

这里机器是掌控者,人是机器的有机体延伸,机器设定规则,发出吼叫,刺激神经,填满有机体空虚孤独的心。比起从站台跳下去,被机器奴役也并不算太差,这样想来,风月场也好,游戏厅也好,经营者都有功德。"""
    signature = "@刀波儿"

    # Make sure every directory used below exists, including the ones for
    # the log file and the generated images.
    for directory in ('fonts', 'backgrounds', 'logs', 'temp'):
        os.makedirs(directory, exist_ok=True)

    # Create a simple striped background when the image configured for the
    # 'minimal' template is missing, so the template actually picks it up.
    background_path = templates['minimal']['background']
    if not os.path.exists(background_path):
        img = Image.new('RGB', (1244, 1660), color=(240, 240, 240))
        draw = ImageDraw.Draw(img)
        for i in range(0, 1660, 20):
            draw.line([(0, i), (1244, i)], fill=(230, 230, 230))
        img.save(background_path)

    # Generate the images.
    converter = TextToImage(log_file='logs/txt_to_image.log')
    output_paths = converter.create_image(
        title=title,
        subtitle=subtitle,
        content=content,
        signature=signature,
        template='minimal',
        output_path='temp/reading_list.png'
    )
    print("生成的图片路径:", output_paths)
|
||||
Reference in New Issue
Block a user