欢迎光临合肥九九信息网
详情描述
Python实现Word文档自动化排版的完整流程

使用Python实现Word文档的自动化排版,可以借助python-docx这个强大的库来实现。

一、环境准备

pip install python-docx
pip install lxml  # python-docx的依赖项(安装python-docx时通常会自动安装,一般无需单独执行)

二、基础文档创建与排版

from docx import Document
from docx.shared import Pt, Inches, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.style import WD_STYLE_TYPE

def create_document_with_formatting():
    """Create a small demo document that exercises basic formatting.

    The document gets core properties (author, title), a centred title and
    sub-heading, and one paragraph made of three runs: plain text, bold text
    and red text. The last (red) run is additionally set to 12 pt
    '微软雅黑'.

    :return: the populated python-docx ``Document`` (not yet saved).
    """
    # Local import so this function does not depend on import order elsewhere
    # in the file.
    from docx.oxml.ns import qn

    doc = Document()

    # 1. Document properties
    doc.core_properties.author = "张三"
    doc.core_properties.title = "自动化排版示例文档"

    # 2. Main title (heading level 0)
    title = doc.add_heading('Python自动化排版报告', 0)
    title.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # 3. Sub-title
    subtitle = doc.add_heading('技术文档 v1.0', 1)
    subtitle.alignment = WD_ALIGN_PARAGRAPH.CENTER

    # 4. Body paragraph
    para = doc.add_paragraph('这是使用Python自动化生成的文档内容。')

    # 5. Character formatting on individual runs
    run = para.add_run('这是一段加粗的文字。')
    run.bold = True

    run = para.add_run(' 这是一段红色的文字。')
    run.font.color.rgb = RGBColor(255, 0, 0)

    # 6. Font of the last run. ``font.name`` only fills the Latin font
    # slots, so the East Asian slot is set explicitly; otherwise the Chinese
    # characters keep the default font.
    font = run.font
    font.name = '微软雅黑'
    run._element.rPr.rFonts.set(qn('w:eastAsia'), '微软雅黑')
    font.size = Pt(12)

    return doc

三、完整排版系统实现

from docx.shared import Cm, Mm
from docx.oxml.ns import qn
from docx.enum.table import WD_TABLE_ALIGNMENT
from datetime import datetime
import os

class WordAutoFormatter:
    """Build a Word document with a fixed page layout and a small style set.

    The instance wraps a python-docx ``Document`` exposed as ``self.doc``.
    On construction it applies A4 page settings to every section and
    registers the styles ``CustomTitle1`` and ``CustomBody`` (paragraph
    styles) and ``CodeStyle`` (a character style) if they are missing.
    """

    def __init__(self, template_path=None):
        """Open the template if it exists, otherwise start a blank document.

        :param template_path: optional path of a .docx to start from. An
            empty or non-existent path falls back to a blank document.
        """
        if template_path and os.path.exists(template_path):
            self.doc = Document(template_path)
        else:
            self.doc = Document()

        self._setup_page_settings()
        self._create_styles()

    @staticmethod
    def _set_font_name(target, name):
        """Set the font of a run or style for Latin and East Asian text.

        ``font.name`` only fills the Latin font slots; Chinese characters
        are rendered with the ``w:eastAsia`` slot, which is set here too.

        :param target: a run or style object exposing ``font``.
        :param name: font family name.
        """
        target.font.name = name
        target._element.rPr.rFonts.set(qn('w:eastAsia'), name)

    @staticmethod
    def _set_shading(props, fill):
        """Set a solid background fill on a ``w:pPr`` or ``w:tcPr`` element.

        The ``w:shd`` child is created when missing. Call this before other
        properties are added to a fresh element so the child order stays
        schema-friendly.

        :param props: the paragraph- or cell-properties XML element.
        :param fill: background colour as a hex string such as ``'F0F0F0'``.
        """
        from docx.oxml import OxmlElement

        shd = props.find(qn('w:shd'))
        if shd is None:
            shd = OxmlElement('w:shd')
            props.append(shd)
        shd.set(qn('w:val'), 'clear')
        shd.set(qn('w:color'), 'auto')
        shd.set(qn('w:fill'), fill)

    def _setup_page_settings(self):
        """Apply margins and A4 page size to every section of the document."""
        for section in self.doc.sections:
            # Margins
            section.top_margin = Cm(2.54)
            section.bottom_margin = Cm(2.54)
            section.left_margin = Cm(3.17)
            section.right_margin = Cm(3.17)

            # Page size (A4)
            section.page_width = Cm(21)
            section.page_height = Cm(29.7)

    def _create_styles(self):
        """Register the custom styles, skipping any that already exist."""
        styles = self.doc.styles

        # Level-1 title style
        if 'CustomTitle1' not in styles:
            style = styles.add_style('CustomTitle1', WD_STYLE_TYPE.PARAGRAPH)
            self._set_font_name(style, '黑体')
            style.font.size = Pt(16)
            style.font.bold = True
            style.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER
            style.paragraph_format.space_after = Pt(12)

        # Body text style
        if 'CustomBody' not in styles:
            style = styles.add_style('CustomBody', WD_STYLE_TYPE.PARAGRAPH)
            self._set_font_name(style, '宋体')
            style.font.size = Pt(12)
            style.paragraph_format.line_spacing = 1.5
            style.paragraph_format.first_line_indent = Cm(0.74)  # first-line indent
            style.paragraph_format.space_after = Pt(6)

        # Code style: a CHARACTER style, so it must be applied to runs.
        if 'CodeStyle' not in styles:
            style = styles.add_style('CodeStyle', WD_STYLE_TYPE.CHARACTER)
            style.font.name = 'Consolas'
            style.font.size = Pt(10)
            style.font.color.rgb = RGBColor(0, 128, 0)

    def add_cover_page(self, title, subtitle=None, author=None, date=None):
        """Append a cover page followed by a page break.

        :param title: main title text.
        :param subtitle: optional italic sub-title shown under the title.
        :param author: optional author name, right-aligned near the bottom.
        :param date: optional date string, right-aligned below the author.
        """
        # Start on a new page only when something already precedes the
        # cover; on an empty document a leading break would leave a blank
        # first page.
        if self.doc.paragraphs or self.doc.tables:
            self.doc.add_page_break()

        # Push the title down the page with spacing before the paragraph.
        para = self.doc.add_paragraph()
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        para.paragraph_format.space_before = Cm(10)

        # Main title
        run = para.add_run(title)
        self._set_font_name(run, '黑体')
        run.font.size = Pt(28)
        run.font.bold = True
        run.font.color.rgb = RGBColor(0, 51, 102)

        if subtitle:
            para = self.doc.add_paragraph()
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = para.add_run(subtitle)
            run.font.size = Pt(14)
            run.font.italic = True

        # Blank lines between the title block and the credits
        for _ in range(15):
            self.doc.add_paragraph()

        # Credits: one right-aligned paragraph per value that was supplied,
        # so a missing author does not leave an empty paragraph behind.
        credit_lines = []
        if author:
            credit_lines.append(f"作者:{author}")
        if date:
            credit_lines.append(f"日期:{date}")
        for line in credit_lines:
            para = self.doc.add_paragraph()
            para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
            run = para.add_run(line)
            run.font.size = Pt(12)

        # End the cover page
        self.doc.add_page_break()

    def add_table_of_contents(self, toc_items=None):
        """Append a manually built table of contents and a page break.

        No TOC field is generated; the entries are written as plain text.

        :param toc_items: optional iterable of ``(text, level)`` pairs;
            level 2 entries are indented. Defaults to the built-in sample
            entries.
        """
        # Heading of the contents page
        para = self.doc.add_paragraph("目 录")
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        run = para.runs[0]
        run.font.size = Pt(16)
        run.font.bold = True

        self.doc.add_paragraph()

        if toc_items is None:
            toc_items = [
                ("1. 引言", 1),
                ("2. 技术实现", 1),
                ("   2.1 环境配置", 2),
                ("   2.2 代码实现", 2),
                ("3. 结果分析", 1),
                ("4. 结论", 1),
            ]

        for text, level in toc_items:
            para = self.doc.add_paragraph()
            if level == 2:
                para.paragraph_format.left_indent = Cm(0.74)

            run = para.add_run(text)
            run.font.size = Pt(12)

        # End the contents page
        self.doc.add_page_break()

    def add_formatted_table(self, headers, data,
                           table_style='LightGrid',
                           header_bg_color=None):
        """Append a centred table with a bold header row, then a blank line.

        :param headers: sequence of column titles; its length fixes the
            number of columns.
        :param data: iterable of rows, each a sequence of cell values that
            are converted with ``str``.
        :param table_style: table style name applied to the table.
        :param header_bg_color: optional hex colour (e.g. ``'D9EAD3'``) used
            as background of the header cells.
        """
        table = self.doc.add_table(rows=1, cols=len(headers))
        table.style = table_style
        table.alignment = WD_TABLE_ALIGNMENT.CENTER

        # Header row
        header_cells = table.rows[0].cells
        for i, header in enumerate(headers):
            cell = header_cells[i]
            cell.text = str(header)
            para = cell.paragraphs[0]
            for run in para.runs:
                run.bold = True
            para.alignment = WD_ALIGN_PARAGRAPH.CENTER

            if header_bg_color:
                # A new cell has no shading element yet, so create it
                # instead of indexing into an empty lookup result.
                self._set_shading(cell._element.get_or_add_tcPr(),
                                  header_bg_color)

        # Data rows
        for row_data in data:
            row_cells = table.add_row().cells
            for i, cell_data in enumerate(row_data):
                row_cells[i].text = str(cell_data)
                row_cells[i].paragraphs[0].alignment = WD_ALIGN_PARAGRAPH.CENTER

        self.doc.add_paragraph()  # blank line after the table

    def add_code_block(self, code, language='python'):
        """Append a shaded paragraph holding ``code`` in fenced form.

        :param code: source text to show.
        :param language: language tag written after the opening fence.
        """
        para = self.doc.add_paragraph()

        # Light grey paragraph background; it lives in the paragraph
        # properties and is set first, while those properties are empty.
        self._set_shading(para._element.get_or_add_pPr(), 'F0F0F0')

        run = para.add_run(f"```{language}\n{code}\n```")
        # 'CodeStyle' is a character style: it is applied to the run,
        # because assigning it as the paragraph style is rejected.
        run.style = 'CodeStyle'
        run.font.name = 'Consolas'
        run.font.size = Pt(10)
        run.font.color.rgb = RGBColor(0, 128, 0)

        self.doc.add_paragraph()

    @staticmethod
    def _replace_story_text(story, text, alignment, color=None):
        """Replace all paragraphs of a header/footer with one text paragraph.

        :param story: the header or footer object of a section.
        :param text: text of the new paragraph.
        :param alignment: paragraph alignment to apply.
        :param color: optional ``RGBColor`` for the text.
        """
        # Remove the existing content
        for paragraph in story.paragraphs:
            p = paragraph._element
            p.getparent().remove(p)

        para = story.add_paragraph()
        para.alignment = alignment
        run = para.add_run(text)
        run.font.size = Pt(9)
        if color is not None:
            run.font.color.rgb = color

    def add_footer(self, text="页脚内容"):
        """Set the footer of the first section to centred grey ``text``.

        The text is inserted literally; no page-number field is created.
        """
        self._replace_story_text(
            self.doc.sections[0].footer,
            text,
            WD_ALIGN_PARAGRAPH.CENTER,
            RGBColor(128, 128, 128),
        )

    def add_header(self, text="页眉内容"):
        """Set the header of the first section to right-aligned ``text``."""
        self._replace_story_text(
            self.doc.sections[0].header,
            text,
            WD_ALIGN_PARAGRAPH.RIGHT,
        )

    def save(self, filename):
        """Write the document to ``filename`` and print a confirmation."""
        self.doc.save(filename)
        print(f"文档已保存:{filename}")

# 使用示例
def create_complete_document():
    """Build and save a sample document using ``WordAutoFormatter``.

    The sample contains a cover page, a table of contents, header and
    footer, an introduction section, a table, a code block and, when
    ``example.png`` can be read, a captioned picture. The result is saved as
    '自动化排版示例.docx' in the current directory.
    """
    formatter = WordAutoFormatter()

    # 1. Cover page
    formatter.add_cover_page(
        title="Python自动化排版系统",
        subtitle="技术文档与实现",
        author="智能助手",
        date=datetime.now().strftime("%Y年%m月%d日")
    )

    # 2. Table of contents
    formatter.add_table_of_contents()

    # 3. Header and footer
    formatter.add_header("Python自动化排版系统")
    # NOTE(review): this f-string has no fields and evaluates to the literal
    # text "第 {{页码}} 页"; nothing visible here turns it into a page number.
    formatter.add_footer(f"第 {{{{页码}}}} 页")

    # 4. Section heading and body text
    formatter.doc.add_heading("1. 引言", level=1)
    para = formatter.doc.add_paragraph(
        "本文档展示了使用Python实现Word文档自动化排版的完整流程。"
        "通过python-docx库,我们可以实现文档创建、格式设置、内容插入等"
        "自动化操作,大大提高文档处理的效率。"
    )
    para.style = 'CustomBody'

    # 5. Table
    headers = ['姓名', '年龄', '部门', '职位']
    data = [
        ['张三', '28', '技术部', '工程师'],
        ['李四', '32', '市场部', '经理'],
        ['王五', '25', '人事部', '专员']
    ]
    formatter.add_formatted_table(headers, data, header_bg_color='D9EAD3')

    # 6. Code sample
    code_example = '''def hello_world():
    """简单的Python函数"""
    print("Hello, World!")
    return True'''

    formatter.add_code_block(code_example, 'python')

    # 7. Optional picture. Only a missing or unreadable file is tolerated;
    # any other error still surfaces instead of being swallowed silently.
    try:
        formatter.doc.add_picture('example.png', width=Cm(10))
    except OSError as exc:
        print(f"跳过图片 example.png: {exc}")
    else:
        para = formatter.doc.add_paragraph("图1:示例图片")
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        para.runs[0].font.size = Pt(10)
        para.runs[0].italic = True

    # 8. Save
    formatter.save('自动化排版示例.docx')

if __name__ == "__main__":
    create_complete_document()

四、批量处理功能

import glob
from pathlib import Path

class BatchWordProcessor:
    """Merge or re-format every .docx file found in a folder.

    ``self.doc`` is the document that ``merge_documents`` appends to; it is
    created once per instance, so repeated merges on the same instance
    accumulate content.
    """

    # Suffix added to the file name of re-formatted copies.
    _FORMATTED_SUFFIX = '_formatted'

    def __init__(self):
        """Create the empty document used as merge target."""
        self.doc = Document()

    @staticmethod
    def _list_docx(folder_path, exclude=()):
        """Return the .docx paths directly inside ``folder_path``, sorted.

        Word lock files (names starting with ``~$``) and any path listed in
        ``exclude`` are left out.

        :param folder_path: folder to scan (not recursive).
        :param exclude: iterable of paths that must not be returned.
        :return: list of path strings in sorted order.
        """
        excluded = {Path(p).resolve() for p in exclude}
        found = []
        for file_path in sorted(glob.glob(f"{folder_path}/*.docx")):
            path = Path(file_path)
            if path.name.startswith('~$'):
                continue
            if path.resolve() in excluded:
                continue
            found.append(file_path)
        return found

    @classmethod
    def _formatted_path(cls, file_path):
        """Return ``file_path`` with the formatted suffix before '.docx'.

        Only the file name is changed, so a '.docx' that happens to occur in
        a directory name is left alone.
        """
        path = Path(file_path)
        return str(path.with_name(f"{path.stem}{cls._FORMATTED_SUFFIX}{path.suffix}"))

    def merge_documents(self, folder_path, output_file='merged.docx'):
        """Append every document in ``folder_path`` to ``self.doc`` and save.

        Each source file contributes a level-1 heading with its file name
        followed by its body paragraphs (run text with bold, italic and font
        size). Files are processed in sorted order and separated by page
        breaks. A file that cannot be processed is reported and skipped.

        :param folder_path: folder containing the .docx files to merge.
        :param output_file: path of the merged document. It is excluded from
            the inputs in case it lies inside ``folder_path``.
        """
        word_files = self._list_docx(folder_path, exclude=[output_file])

        merged_count = 0
        for file_path in word_files:
            try:
                sub_doc = Document(file_path)

                # Break before every file except the first one merged, so a
                # failing last file does not leave a trailing blank page.
                if merged_count:
                    self.doc.add_page_break()

                # File name as section heading
                self.doc.add_heading(Path(file_path).stem, level=1)

                # Copy all body paragraphs
                for para in sub_doc.paragraphs:
                    new_para = self.doc.add_paragraph()
                    for run in para.runs:
                        new_run = new_para.add_run(run.text)
                        new_run.bold = run.bold
                        new_run.italic = run.italic
                        new_run.font.size = run.font.size

            except Exception as e:
                # Best effort: one broken file must not abort the batch.
                print(f"处理文件 {file_path} 时出错: {e}")
            else:
                merged_count += 1

        self.doc.save(output_file)
        print(f"已合并 {merged_count} 个文件到 {output_file}")

    def batch_format(self, folder_path, style_config):
        """Write a re-formatted copy of every document in ``folder_path``.

        Each copy is saved next to its source with '_formatted' added to the
        file name. Files that already carry that suffix are skipped so that
        running the batch again does not format its own output.

        :param folder_path: folder containing the .docx files.
        :param style_config: mapping read by ``_apply_formatting``.
        """
        for file_path in self._list_docx(folder_path):
            if Path(file_path).stem.endswith(self._FORMATTED_SUFFIX):
                continue
            try:
                doc = Document(file_path)
                self._apply_formatting(doc, style_config)

                new_path = self._formatted_path(file_path)
                doc.save(new_path)
                print(f"已格式化: {new_path}")

            except Exception as e:
                # Best effort: report and continue with the next file.
                print(f"处理文件 {file_path} 时出错: {e}")

    def _apply_formatting(self, doc, config):
        """Set font name and size on every run of ``doc``'s body paragraphs.

        Paragraphs whose style name starts with 'Heading' use the
        ``heading_font``/``heading_size`` keys of ``config`` (defaults
        '黑体', 16); all others use ``body_font``/``body_size`` (defaults
        '宋体', 12). Sizes are in points.
        """
        heading_font = config.get('heading_font', '黑体')
        heading_size = Pt(config.get('heading_size', 16))
        body_font = config.get('body_font', '宋体')
        body_size = Pt(config.get('body_size', 12))

        for para in doc.paragraphs:
            if para.style.name.startswith('Heading'):
                font_name, font_size = heading_font, heading_size
            else:
                font_name, font_size = body_font, body_size

            for run in para.runs:
                run.font.name = font_name
                # ``font.name`` fills only the Latin slots; the East Asian
                # slot decides the font used for Chinese characters.
                run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)
                run.font.size = font_size

五、实用工具函数

def extract_document_info(doc_path):
    """Open the document at ``doc_path`` and return summary figures.

    :param doc_path: path of the .docx file to inspect.
    :return: dict with the keys 'paragraphs', 'tables' and 'images'
        (counts), 'pages' (value from ``estimate_page_count``) and
        'author', 'created', 'modified' (core properties).
    """
    document = Document(doc_path)

    summary = {}
    summary['paragraphs'] = len(document.paragraphs)
    summary['tables'] = len(document.tables)
    summary['images'] = len(document.inline_shapes)
    summary['pages'] = estimate_page_count(document)

    # Metadata stored in the document's core properties
    summary['author'] = document.core_properties.author
    summary['created'] = document.core_properties.created
    summary['modified'] = document.core_properties.modified
    return summary

def estimate_page_count(doc, chars_per_page=3000):
    """Estimate the page count from the text length of the body paragraphs.

    The estimate is the total number of characters in ``doc.paragraphs``
    divided by ``chars_per_page``, rounded up, and never less than 1.
    Rounding up matters: text that overflows a page by one character needs
    another page.

    :param doc: object exposing ``paragraphs``, each with a ``text`` string.
    :param chars_per_page: assumed characters per page (default 3000).
    :return: estimated number of pages as an int >= 1.
    :raises ValueError: if ``chars_per_page`` is not positive.
    """
    if chars_per_page <= 0:
        raise ValueError("chars_per_page must be positive")

    total_chars = sum(len(para.text) for para in doc.paragraphs)

    # Ceiling division without floats.
    return max(1, -(-total_chars // chars_per_page))

def replace_text_in_document(doc_path, replacements, output_path):
    """Replace text in a document and save the result to ``output_path``.

    Body paragraphs and paragraphs inside tables (including nested tables)
    are processed. When every occurrence of a search string lies inside
    single runs, the replacement is done run by run so character formatting
    is kept. Only when an occurrence is split across runs is the whole
    paragraph text rewritten, which collapses that paragraph's formatting.

    :param doc_path: path of the source .docx file.
    :param replacements: mapping of old text to new text. Entries are
        applied in the mapping's order; empty keys are ignored.
    :param output_path: path the modified document is written to.
    """
    doc = Document(doc_path)

    # Merged cells are reported once per grid position; remember the cells
    # already visited so their text is not processed twice.
    seen_cells = set()

    def iter_paragraphs(container):
        """Yield the paragraphs of ``container`` and of its tables' cells."""
        yield from container.paragraphs
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell._tc in seen_cells:
                        continue
                    seen_cells.add(cell._tc)
                    yield from iter_paragraphs(cell)

    for para in iter_paragraphs(doc):
        for old_text, new_text in replacements.items():
            if not old_text:
                # An empty search string matches everywhere.
                continue

            total = para.text.count(old_text)
            if not total:
                continue

            in_runs = sum(run.text.count(old_text) for run in para.runs)
            if in_runs == total:
                # Every hit sits inside a single run: keep run formatting.
                for run in para.runs:
                    if old_text in run.text:
                        run.text = run.text.replace(old_text, new_text)
            else:
                # At least one hit spans runs: rewrite the paragraph text.
                para.text = para.text.replace(old_text, new_text)

    doc.save(output_path)

六、高级功能:模板系统

class TemplateSystem:
    """Generate documents by filling ``{{name}}`` placeholders in a template.

    ``self.template`` holds the loaded template document and is never
    modified; ``self.placeholders`` lists the placeholder names found in it.
    """

    def __init__(self, template_path):
        """Load the template and collect its placeholder names.

        :param template_path: path of the template .docx file.
        """
        self.template = Document(template_path)
        self.placeholders = self._find_placeholders()

    @classmethod
    def _iter_paragraphs(cls, container, _seen=None):
        """Yield the paragraphs of ``container`` and of its tables' cells.

        Nested tables are followed. Merged cells are reported once per grid
        position, so visited cells are remembered and yielded only once.
        """
        seen = set() if _seen is None else _seen
        yield from container.paragraphs
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    if cell._tc in seen:
                        continue
                    seen.add(cell._tc)
                    yield from cls._iter_paragraphs(cell, seen)

    def _find_placeholders(self):
        """Return the distinct placeholder names in the template, sorted.

        A placeholder is the text between ``{{`` and ``}}``; the name is
        returned exactly as written, including any inner spaces.
        """
        import re

        pattern = re.compile(r'\{\{(.*?)\}\}')
        found = set()
        for para in self._iter_paragraphs(self.template):
            found.update(pattern.findall(para.text))
        return sorted(found)

    @staticmethod
    def _fill_paragraph(para, replacements):
        """Replace placeholders in one paragraph.

        When every occurrence of a placeholder lies inside single runs the
        replacement is done per run, which keeps character formatting.
        Otherwise the paragraph text is rewritten as a whole.

        :param para: paragraph exposing ``text`` and ``runs``.
        :param replacements: mapping of full placeholder (with braces) to
            replacement string.
        """
        if '{{' not in para.text:
            return

        for placeholder, value in replacements.items():
            total = para.text.count(placeholder)
            if not total:
                continue

            in_runs = sum(run.text.count(placeholder) for run in para.runs)
            if in_runs == total:
                for run in para.runs:
                    if placeholder in run.text:
                        run.text = run.text.replace(placeholder, value)
            else:
                # The placeholder is split across runs.
                para.text = para.text.replace(placeholder, value)

    def fill_template(self, data_dict, output_path):
        """Fill the placeholders and save the result to ``output_path``.

        The output starts as a full copy of the template, so styles, page
        setup, tables and run formatting are kept. Placeholders without an
        entry in ``data_dict`` are left unchanged.

        :param data_dict: mapping of placeholder name to value; values are
            converted with ``str``.
        :param output_path: path of the generated document.
        """
        import io

        # Copy the template through an in-memory save so that
        # ``self.template`` stays untouched and can be filled again.
        buffer = io.BytesIO()
        self.template.save(buffer)
        buffer.seek(0)
        doc = Document(buffer)

        replacements = {
            f'{{{{{key}}}}}': str(value) for key, value in data_dict.items()
        }
        for para in self._iter_paragraphs(doc):
            self._fill_paragraph(para, replacements)

        doc.save(output_path)
        print(f"模板已填充并保存到: {output_path}")

# 使用模板系统
def use_template_system():
    """Create a small template file, then fill it through ``TemplateSystem``.

    Writes 'report_template.docx' (a heading and three paragraphs holding
    placeholders) and generates '月度报告_202401.docx' from it.
    """
    template_file = 'report_template.docx'

    # Step 1: write the template with its placeholders.
    blank = Document()
    blank.add_heading('{{title}}', level=1)
    for line in ('日期: {{date}}', '作者: {{author}}', '{{content}}'):
        blank.add_paragraph(line)
    blank.save(template_file)

    # Step 2: load the template and fill in the values.
    filler = TemplateSystem(template_file)
    values = dict(
        title='月度报告',
        date='2024-01-15',
        author='张三',
        content='这是本月的详细报告内容...',
    )
    filler.fill_template(values, '月度报告_202401.docx')

七、最佳实践建议

1. 错误处理:始终添加异常处理
2. 性能优化:处理大文档时考虑内存使用
3. 兼容性:确保字体在所有目标系统可用
4. 模板设计:预先设计好模板格式
5. 版本控制:保持python-docx库版本稳定

八、注意事项

1. 页眉页脚限制:python-docx对复杂页眉页脚支持有限
2. 目录生成:需要手动管理或使用其他方法
3. 格式继承:注意样式的继承关系
4. 性能:处理大型文档时可能较慢

这个完整的自动化排版系统可以用于生成报告、文档、合同等各种标准化文档,大大提高工作效率。你可以根据具体需求调整和扩展这些功能。