C# 使用 OpenXML 打开加密的 Word 文档并读取内容

发布于:2025-07-28 ⋅ 阅读:(16) ⋅ 点赞:(0)
using System;
using System.IO;
using System.Linq;
using System.Text;
using DocumentFormat.OpenXml;
using DocumentFormat.OpenXml.Packaging;
using DocumentFormat.OpenXml.Wordprocessing;

/// <summary>
/// Reads the main-body text of a Word document through the Open XML SDK
/// (intended as a replacement for an Aspose-based reader).
/// </summary>
/// <param name="path">Path of the document to open.</param>
/// <param name="password">Password passed to the open settings.</param>
/// <returns>
/// The cleaned body text, cut off just before the last occurrence of the marker "限公司第"
/// when that marker is found at an index greater than zero; otherwise the full cleaned text.
/// Returns <c>null</c> when the document has no body or when any exception is thrown.
/// </returns>
public static string GetWordContentByOpenXml(string path, string password)
{
    try
    {
        // NOTE(review): the Open XML SDK's OpenSettings is not known to expose a Password
        // property, and the SDK does not appear to decrypt password-to-open packages.
        // Confirm this compiles and works against the SDK version actually referenced.
        using (var document = WordprocessingDocument.Open(path, false, new OpenSettings()
        {
            Password = password
        }))
        {
            var body = document.MainDocumentPart?.Document?.Body;
            if (body == null)
            {
                return null;
            }

            // Only the body is walked; header/footer parts are never visited.
            var contentBuilder = new StringBuilder();
            ExtractBodyContent(body, contentBuilder);

            string content = CleanContent(contentBuilder.ToString());

            // Ordinal search: the marker is a fixed literal, so culture rules must not apply.
            int index = content.LastIndexOf("限公司第", StringComparison.Ordinal);
            return index > 0
                ? content.Substring(0, index).Trim()
                : content;
        }
    }
    catch (Exception ex)
    {
        // Log the full exception (type, message and stack trace), not just the stack trace.
        LogManager.WriteError("GetWordContentByOpenXml()", ex.ToString());
        return null;
    }
}

/// <summary>
/// Appends the text of every direct child of the document body to the builder.
/// </summary>
/// <param name="body">Document body whose children are walked.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractBodyContent(Body body, StringBuilder contentBuilder)
{
    // Each top-level child is dispatched to the recursive element extractor.
    foreach (OpenXmlElement topLevelChild in body.Elements())
    {
        ExtractElementContent(topLevelChild, contentBuilder);
    }
}

/// <summary>
/// Extracts text from a single element, recursing into containers that are
/// neither paragraphs, tables nor section properties.
/// </summary>
/// <param name="element">Open XML element to process.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractElementContent(OpenXmlElement element, StringBuilder contentBuilder)
{
    if (element is Paragraph para)
    {
        ExtractParagraphContent(para, contentBuilder);
        contentBuilder.AppendLine(); // line break after every paragraph
        return;
    }

    if (element is Table tbl)
    {
        ExtractTableContent(tbl, contentBuilder);
        return;
    }

    if (element is SectionProperties)
    {
        // Section properties are skipped entirely.
        return;
    }

    // Any other element is treated as a container: descend into its children.
    foreach (var nested in element.Elements())
    {
        ExtractElementContent(nested, contentBuilder);
    }
}

/// <summary>
/// Appends the text of a paragraph's direct child runs to the builder.
/// </summary>
/// <remarks>
/// Only runs that are direct children of the paragraph are visited; runs nested inside
/// other inline containers are not. Within each run, text, tab and break elements are
/// emitted in document order so that their relative positions are preserved.
/// </remarks>
/// <param name="paragraph">Paragraph to read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractParagraphContent(Paragraph paragraph, StringBuilder contentBuilder)
{
    foreach (var run in paragraph.Elements<Run>())
    {
        // Single pass over the run's children: three separate passes (texts, then tabs,
        // then breaks) would move every tab and break to the end of the run's text.
        foreach (var child in run.Elements())
        {
            switch (child)
            {
                case Text text:
                    contentBuilder.Append(text.Text);
                    break;

                case TabChar _:
                    contentBuilder.Append("\t");
                    break;

                case Break _:
                    contentBuilder.AppendLine();
                    break;
            }
        }
    }
}

/// <summary>
/// Appends a table's text to the builder: a tab after every cell and a line break after every row.
/// </summary>
/// <remarks>
/// Only paragraphs that are direct children of a cell are read, and they are written
/// back-to-back with no separator between them.
/// </remarks>
/// <param name="table">Table to read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractTableContent(Table table, StringBuilder contentBuilder)
{
    foreach (TableRow tableRow in table.Elements<TableRow>())
    {
        var cellsInRow = tableRow.Elements<TableCell>();
        foreach (TableCell tableCell in cellsInRow)
        {
            var cellParagraphs = tableCell.Elements<Paragraph>();
            foreach (Paragraph cellParagraph in cellParagraphs)
            {
                ExtractParagraphContent(cellParagraph, contentBuilder);
            }

            // Tab terminates each cell.
            contentBuilder.Append('\t');
        }

        // Line break terminates each row.
        contentBuilder.Append(Environment.NewLine);
    }
}

/// <summary>
/// Normalizes extracted text: removes the Aspose evaluation watermark literal, strips
/// non-whitespace control characters, collapses every whitespace run (including line
/// breaks and tabs) into a single space, and trims the result.
/// </summary>
/// <param name="content">Raw extracted text; may be <c>null</c> or empty.</param>
/// <returns>The cleaned text, or <see cref="string.Empty"/> for <c>null</c>/empty input.</returns>
private static string CleanContent(string content)
{
    if (string.IsNullOrEmpty(content))
    {
        return string.Empty;
    }

    // Removals happen BEFORE whitespace collapsing; otherwise deleting a token that sat
    // between two spaces would leave a doubled space in the output.
    content = content.Replace("EvaluationOnly.CreatedwithAspose.Words.Copyright2003-2024AsposePtyLtd.", "");

    // Strip control characters that the regex \s class does not match. \t, \n, \v, \f and \r
    // are deliberately left alone here so the next step turns them into a separating space.
    content = System.Text.RegularExpressions.Regex.Replace(content, @"[\x00-\x08\x0E-\x1F\x7F]", "");

    // Collapse all whitespace runs to one space. After this no newline remains, which is why
    // no separate blank-line cleanup step is needed.
    content = System.Text.RegularExpressions.Regex.Replace(content, @"\s+", " ");

    return content.Trim();
}

/// <summary>
/// Best-effort probe for whether a document needs a password, based on whether it can
/// be opened read-only without one.
/// </summary>
/// <remarks>
/// This is a heuristic. Any exception other than <c>OpenXmlPackageException</c>,
/// whatever its cause, is reported as "password required".
/// </remarks>
/// <param name="path">Path of the document to probe.</param>
/// <returns>
/// <c>false</c> when the document opens successfully; for an <c>OpenXmlPackageException</c>,
/// <c>true</c> only if its message mentions "password", "encrypted" or "protected"
/// (case-insensitive); <c>true</c> for every other exception.
/// </returns>
public static bool IsPasswordRequired(string path)
{
    try
    {
        using (WordprocessingDocument.Open(path, false))
        {
            // Opened without a password, so none is required.
            return false;
        }
    }
    catch (OpenXmlPackageException ex)
    {
        // Match keywords case-insensitively so capitalized messages ("Password ...") are caught too.
        string message = ex.Message ?? string.Empty;
        return new[] { "password", "encrypted", "protected" }
            .Any(keyword => message.IndexOf(keyword, StringComparison.OrdinalIgnoreCase) >= 0);
    }
    catch
    {
        // Deliberate best-effort: any other failure is assumed to mean the file needs a password.
        return true;
    }
}

/// <summary>
/// Extended reader: extracts the main-body text and can optionally include hyperlink
/// text and footnotes.
/// </summary>
/// <param name="path">Path of the document to open.</param>
/// <param name="password">Password passed to the open settings.</param>
/// <param name="includeHyperlinks">Whether text inside hyperlinks is included.</param>
/// <param name="includeFootnotes">Whether footnote text is appended after the body text.</param>
/// <returns>
/// The cleaned text, cut off just before the last occurrence of the marker "公司第" when
/// that marker is found at an index greater than zero; otherwise the full cleaned text.
/// Returns <c>null</c> when the document has no body or when any exception is thrown.
/// </returns>
public static string GetWordContentByOpenXmlAdvanced(string path, string password, bool includeHyperlinks = false, bool includeFootnotes = false)
{
    try
    {
        // NOTE(review): the Open XML SDK's OpenSettings is not known to expose a Password
        // property, and the SDK does not appear to decrypt password-to-open packages.
        // Confirm this compiles and works against the SDK version actually referenced.
        using (var document = WordprocessingDocument.Open(path, false, new OpenSettings()
        {
            Password = password
        }))
        {
            var mainPart = document.MainDocumentPart;
            var body = mainPart?.Document?.Body;
            if (body == null)
            {
                return null;
            }

            var contentBuilder = new StringBuilder();
            ExtractBodyContentAdvanced(body, contentBuilder, includeHyperlinks);

            // Footnotes are appended after the body, so the marker-based cut below applies to them too.
            if (includeFootnotes && mainPart.FootnotesPart != null)
            {
                ExtractFootnotesContent(mainPart.FootnotesPart, contentBuilder);
            }

            string content = CleanContent(contentBuilder.ToString());

            // NOTE(review): this marker ("公司第") differs from the one used by
            // GetWordContentByOpenXml ("限公司第") — confirm the difference is intended.
            // Ordinal search: the marker is a fixed literal, so culture rules must not apply.
            int index = content.LastIndexOf("公司第", StringComparison.Ordinal);
            return index > 0
                ? content.Substring(0, index).Trim()
                : content;
        }
    }
    catch (Exception ex)
    {
        // Log the full exception (type, message and stack trace), not just the stack trace.
        LogManager.WriteError("GetWordContentByOpenXmlAdvanced()", ex.ToString());
        return null;
    }
}

/// <summary>
/// Appends the text of every direct child of the body to the builder, with optional
/// hyperlink text.
/// </summary>
/// <param name="body">Document body whose children are walked.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Whether text inside hyperlinks is included.</param>
private static void ExtractBodyContentAdvanced(Body body, StringBuilder contentBuilder, bool includeHyperlinks)
{
    foreach (var element in body.Elements())
    {
        ExtractElementContentAdvanced(element, contentBuilder, includeHyperlinks);
    }
}

/// <summary>
/// Extracts text from one element, recursing into containers that are neither
/// paragraphs, tables nor section properties.
/// </summary>
/// <remarks>
/// Recursion works on the element itself. Wrapping a child in a new <c>Body</c> to recurse
/// would try to re-parent an element that already belongs to the document tree.
/// </remarks>
private static void ExtractElementContentAdvanced(OpenXmlElement element, StringBuilder contentBuilder, bool includeHyperlinks)
{
    switch (element)
    {
        case Paragraph paragraph:
            ExtractParagraphContentAdvanced(paragraph, contentBuilder, includeHyperlinks);
            contentBuilder.AppendLine(); // line break after every paragraph
            break;

        case Table table:
            ExtractTableContentAdvanced(table, contentBuilder, includeHyperlinks);
            break;

        case SectionProperties _:
            // Section properties are skipped entirely.
            break;

        default:
            // Any other element is treated as a container: descend into its children.
            foreach (var childElement in element.Elements())
            {
                ExtractElementContentAdvanced(childElement, contentBuilder, includeHyperlinks);
            }
            break;
    }
}

/// <summary>
/// Appends a paragraph's text to the builder, reading its direct child runs and, when
/// requested, the runs directly inside its hyperlinks.
/// </summary>
/// <param name="paragraph">Paragraph to read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Whether text inside hyperlinks is included.</param>
private static void ExtractParagraphContentAdvanced(Paragraph paragraph, StringBuilder contentBuilder, bool includeHyperlinks)
{
    foreach (var element in paragraph.Elements())
    {
        if (element is Run run)
        {
            AppendRunContentAdvanced(run, contentBuilder);
        }
        else if (includeHyperlinks && element is Hyperlink hyperlink)
        {
            foreach (var linkRun in hyperlink.Elements<Run>())
            {
                AppendRunContentAdvanced(linkRun, contentBuilder);
            }
        }
    }
}

/// <summary>
/// Appends a run's text, tab and break elements in document order. Tabs and breaks are
/// emitted so that words they separate are not glued together, matching the basic extractor.
/// </summary>
private static void AppendRunContentAdvanced(Run run, StringBuilder contentBuilder)
{
    foreach (var child in run.Elements())
    {
        switch (child)
        {
            case Text text:
                contentBuilder.Append(text.Text);
                break;

            case TabChar _:
                contentBuilder.Append("\t");
                break;

            case Break _:
                contentBuilder.AppendLine();
                break;
        }
    }
}

/// <summary>
/// Appends a table's text to the builder with optional hyperlink text: a tab after every
/// cell and a line break after every row.
/// </summary>
/// <remarks>
/// Only paragraphs that are direct children of a cell are read, and they are written
/// back-to-back with no separator between them.
/// </remarks>
/// <param name="table">Table to read.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
/// <param name="includeHyperlinks">Whether text inside hyperlinks is included.</param>
private static void ExtractTableContentAdvanced(Table table, StringBuilder contentBuilder, bool includeHyperlinks)
{
    foreach (TableRow tableRow in table.Elements<TableRow>())
    {
        var cellsInRow = tableRow.Elements<TableCell>();
        foreach (TableCell tableCell in cellsInRow)
        {
            var cellParagraphs = tableCell.Elements<Paragraph>();
            foreach (Paragraph cellParagraph in cellParagraphs)
            {
                ExtractParagraphContentAdvanced(cellParagraph, contentBuilder, includeHyperlinks);
            }

            // Tab terminates each cell.
            contentBuilder.Append('\t');
        }

        // Line break terminates each row.
        contentBuilder.Append(Environment.NewLine);
    }
}

/// <summary>
/// Appends a footnotes header line followed by the text of every footnote paragraph.
/// Does nothing when the part has no footnotes root element.
/// </summary>
/// <param name="footnotesPart">Footnotes part of the document.</param>
/// <param name="contentBuilder">Builder that receives the extracted text.</param>
private static void ExtractFootnotesContent(FootnotesPart footnotesPart, StringBuilder contentBuilder)
{
    if (footnotesPart.Footnotes == null)
    {
        return;
    }

    // Header separating the footnotes from the preceding text.
    contentBuilder.AppendLine("\n--- 脚注 ---");

    foreach (Footnote note in footnotesPart.Footnotes.Elements<Footnote>())
    {
        foreach (Paragraph noteParagraph in note.Elements<Paragraph>())
        {
            ExtractParagraphContent(noteParagraph, contentBuilder);
            contentBuilder.Append(Environment.NewLine); // one line per footnote paragraph
        }
    }
}

网站公告

今日签到

点亮在社区的每一天
去签到