一、简介
“Apache POI”是一个用于操作Microsoft Office格式文件(如Word文档、Excel表格、PowerPoint演示文稿)的开源Java库。POI允许开发人员读取和写入这些Office文件,使得Java应用程序能够与Microsoft Office文件进行交互。
这里介绍一下POI解析doc与docx文件的demo。
二、doc文档解析
/**
 * Converts a binary Word (.doc) file into an HTML string.
 *
 * <p>The document is read from {@code savePath}, rendered to an HTML DOM with
 * {@link WordToHtmlConverter}, its embedded pictures are written under
 * {@code PIC_SAVE_PATH}, and the serialized HTML is finally passed through
 * {@code replacedPicContent} so the image paths are rewritten.
 *
 * @param fileUrl  source URL of the file; only used in the failure log message
 * @param savePath local path of the .doc file to parse
 * @param encode   charset name used both to serialize the HTML and to decode it back to a String
 * @return the converted HTML, or an empty string if any step fails (the failure is logged)
 * @throws Exception declared for interface compatibility; failures are caught and logged here
 */
private String parseDoc(String fileUrl, String savePath, String encode) throws Exception {
    // try-with-resources: the original never closed the input stream of the .doc file.
    try (FileInputStream docInput = new FileInputStream(savePath)) {
        HWPFDocument wordDocument = new HWPFDocument(docInput);
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            // The returned value is the image path written into the generated HTML.
            // This callback does not write the picture itself; the files are written below.
            @Override
            public String savePicture(byte[] content,
                                      PictureType pictureType, String suggestedName,
                                      float widthInches, float heightInches) {
                return PIC_SAVE_PATH + "/" + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);

        // Write every embedded picture to the local picture directory.
        List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics != null) {
            for (Picture pic : pics) {
                String picPath = PIC_SAVE_PATH + "/" + pic.suggestFullFileName();
                // Close each picture stream even when writing fails.
                try (FileOutputStream picOut = new FileOutputStream(picPath)) {
                    pic.writeImageContent(picOut);
                }
            }
        }

        // Serialize the HTML DOM into an in-memory buffer.
        Document htmlDocument = wordToHtmlConverter.getDocument();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Transformer serializer = TransformerFactory.newInstance().newTransformer();
        serializer.setOutputProperty(OutputKeys.ENCODING, encode);
        serializer.setOutputProperty(OutputKeys.INDENT, "yes");
        serializer.setOutputProperty(OutputKeys.METHOD, "html");
        serializer.transform(new DOMSource(htmlDocument), new StreamResult(out));

        // Decode with the same charset the serializer wrote with; the platform default
        // charset would corrupt non-ASCII text whenever it differs from `encode`.
        String html = new String(out.toByteArray(), encode);

        // Rewrite the image paths in the HTML.
        return replacedPicContent(html);
    } catch (Exception e) {
        // Pass the exception itself as the last argument so the logger records the
        // message, cause and stack trace instead of a stringified StackTraceElement[].
        logger.error("traverse Doc fail. fileUrl:{}, savePath:{}", fileUrl, savePath, e);
        return "";
    }
}
其中的PIC_SAVE_PATH是我们自己定义的本地图片保存路径。
三、docx文档解析
/**
 * Converts an OOXML Word (.docx) file into an HTML string.
 *
 * <p>The document is read from {@code savePath}; its pictures are written under
 * {@code PIC_SAVE_PATH}; the HTML is written to a file under
 * {@code FILE_PARSING_RES_PATH} whose name is {@code DigestUtils.md5Hex(fileUrl)}
 * plus {@code .html}; that file is then read back, its {@code word/media} image
 * prefix is replaced with {@code PIC_SAVE_PATH}, and the result is passed through
 * {@code replacedPicContent}.
 *
 * @param fileUrl  source URL of the file; used to derive the output file name and in the failure log
 * @param savePath local path of the .docx file to parse
 * @param encode   charset name used when reading the generated HTML file back
 * @return the converted HTML, or an empty string if any step fails (the failure is logged)
 * @throws Exception declared for interface compatibility; failures are caught and logged here
 */
private String parseDocx(String fileUrl, String savePath, String encode) throws Exception {
    String redisKey = DigestUtils.md5Hex(fileUrl);
    String parsingPath = FILE_PARSING_RES_PATH + "/" + redisKey + ".html";
    File htmlOutputFile = new File(parsingPath);

    try {
        // Both the input stream and the document are released when this scope ends.
        try (FileInputStream fis = new FileInputStream(new File(savePath));
             XWPFDocument document = new XWPFDocument(fis)) {

            // Extract and save the embedded pictures.
            for (XWPFPictureData picture : document.getAllPictures()) {
                String imagePath = PIC_SAVE_PATH + "/" + picture.getFileName();
                // Close the stream even if the write fails.
                try (FileOutputStream imageOutFile = new FileOutputStream(imagePath)) {
                    imageOutFile.write(picture.getData());
                }
            }

            // Convert the document content to HTML. The stream is closed before the
            // file is read back so that everything is flushed to disk.
            try (OutputStream htmlOutputStream = new FileOutputStream(htmlOutputFile)) {
                XHTMLConverter.getInstance().convert(document, htmlOutputStream, null);
            }
        }

        // The converter emits image paths under word/media; point them at the fixed
        // picture directory instead. String.replace is literal, so a '$' or '\' in
        // PIC_SAVE_PATH (e.g. a Windows path) is not treated as regex replacement syntax.
        String htmlContent = readFileToString(htmlOutputFile, encode);
        htmlContent = htmlContent.replace("src=\"word/media", "src=\"" + PIC_SAVE_PATH);
        return replacedPicContent(htmlContent);
    } catch (Exception e) {
        // Pass the exception itself as the last argument so the logger records the
        // message, cause and stack trace instead of a stringified StackTraceElement[].
        logger.error("traverse Docx fail. fileUrl:{}, savePath:{}", fileUrl, savePath, e);
        return "";
    }
}
四、图片路径替换
由于业务需要,保存在本地的图片需要上传到NOS中,因此本案例将HTML中的图片路径替换为上传后的NOS地址;实际使用时请根据具体需求调整。
// Regex for <img> tags in the HTML: matches "<img", whitespace, then src="..." as the
// first attribute; capture group 1 is the src attribute value.
public static final String PIC_REG= "<img\\s+src=\"([^\"]+)\"";
/** Precompiled form of {@code PIC_REG}, so the pattern is not recompiled on every call. */
private static final Pattern PIC_PATTERN = Pattern.compile(PIC_REG);

/**
 * Uploads every image referenced by an {@code <img src="...">} tag in the given HTML
 * and rewrites the tag to point at the address returned by {@code uploadPic}.
 *
 * <p>Only tags matched by {@code PIC_REG} are rewritten; everything else is copied
 * through unchanged.
 *
 * @param orginHtmlContent HTML whose image paths should be replaced; may be null or empty
 * @return the HTML with the matched image paths replaced; null or empty input is returned as-is
 * @throws RuntimeException if uploading any picture fails (the original exception is the cause)
 */
private static String replacedPicContent(String orginHtmlContent) {
    // Nothing to scan: avoid an NPE on null input.
    if (orginHtmlContent == null || orginHtmlContent.isEmpty()) {
        return orginHtmlContent;
    }

    Matcher matcher = PIC_PATTERN.matcher(orginHtmlContent);
    StringBuffer result = new StringBuffer(orginHtmlContent.length());

    // For each image found: upload it, then swap its path for the returned address.
    while (matcher.find()) {
        // Group 1 is the src value, used here as the local picture path.
        String fileName = matcher.group(1);
        String nosUrl;
        try {
            nosUrl = uploadPic(fileName, new File(fileName));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload picture: " + fileName, e);
        }
        // quoteReplacement: a '$' or '\' in the URL must be inserted literally, not
        // interpreted as a group reference or escape by appendReplacement.
        matcher.appendReplacement(result,
                Matcher.quoteReplacement("<img src=\"" + nosUrl + "\""));
    }

    // Append whatever follows the last match.
    matcher.appendTail(result);
    return result.toString();
}
以上就解决了doc/docx文件转化为HTML的需求。具体效果如下:
可以看到基本的格式还都是保留的。效果还不错~
本文含有隐藏内容,请 开通VIP 后查看