POI 使用 HWPFDocument 和 XWPFDocument 分割 word2003 和 word2010示例

流曲頻陽 2016-12-05

展開全文

[摘要：自己在做項目過程中用到了解析分割word2003和2007,2010等文件內容，以下是代碼：下面是部分代碼，如果想要全部代碼可以發我郵箱，yongqian.liu@peraglobal.com，接口類：PoiExtractContent.java]

自己在做項目過程中用到了解析分割word2003和2007,2010等文件內容，以下是代碼：

下面是部分代碼，如果想要全部代碼可以發我郵箱，yongqian.liu@peraglobal.com，

接口類：PoiExtractContent.java

package com.peraglobal.extract.poi;

import java.util.Map;

/**
 * Contract for extracting the content of a Word file with Apache POI.
 *
 * @param <T> type of the document object an implementation operates on
 * @author yongqian.liu
 * 2015-2-9
 */
public interface PoiExtractContent<T> {

    /**
     * Obtains the document object for the file at the given path.
     *
     * @param docPath path of the Word file
     * @return the document object
     */
    T getDocument(String docPath);

    /**
     * Resolves the title of a Word document.
     *
     * @param doc the document object
     * @return the title found in the document
     */
    String getTilte(T doc);

    /**
     * Returns all textual content of a Word document (pictures, tables and
     * other formatted content are not included).
     *
     * @param doc the document object
     * @return the complete text portion of the document
     */
    String getContent(T doc);

    /**
     * Returns all textual content of the Word file at the given path
     * (pictures, tables and other formatted content are not included).
     *
     * @param docPath path of the Word file
     * @return the complete text portion of the document
     */
    String getContent(String docPath);

    /**
     * Extracts every picture of a Word document and saves it under the
     * given location.
     *
     * @param doc     the document object
     * @param picPath path under which the pictures are saved
     * @param suffix  file-name suffix for the saved pictures
     */
    void getPictures(T doc, String picPath, String suffix);

    /**
     * Reads all tables of a Word document.
     *
     * @param doc the document object
     */
    void getTables(T doc);

    /**
     * Determines the largest font size used in a Word document.
     *
     * @param doc the document object
     * @return the largest font size
     */
    int getMaxFontSize(T doc);

    /**
     * Reads the creation metadata of the Word file at the given path.
     *
     * @param docPath path of the Word file
     * @return the document's creation metadata
     */
    Map<String, String> getInfo(String docPath);

}

2003實現類：PoiHwpfExtractContentImpl.java

package com.peraglobal.extract.poi.impl;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import com.peraglobal.extract.poi.PoiExtractContent;
import com.peraglobal.extract.util.Const;
import com.peraglobal.extract.util.FormatTextUtil;

/**
* 使用 POI 解析 DOC2003 文件的內(nèi)容信息
* @author yongqian.liu
* 2015-2-9
*/
public class PoiHwpfExtractContentImpl implements PoiExtractContent<HWPFDocument> {

/**
* 根據(jù)文件路徑獲得 Document 對象
* @param docPath 路徑
* @return Document
*/
public HWPFDocument getDocument(String docPath) {
// hwpfDocument 是專門處理 word 的，在 poi 中還有處理其他 office 文檔的類
HWPFDocument doc = null;
try {
doc = new HWPFDocument(new FileInputStream(docPath));
} catch (Exception e) {
e.printStackTrace();
}
return doc;
}

/**
* 解析 word 文檔的標題
* @param doc Document 對象
* @return 標題
*/
public String getTilte(HWPFDocument doc) {
String title = "\";
Range range = doc.getRange();
Paragraph p = null;
for (int i = 0; i < range.numParagraphs(); i++) {
p = range.getParagraph(i);
if(p.text() != null && !p.text().equals("") && !p.text().equals("r")){
title = p.text().trim();
break;
}
}
return title;
}

/**
* 獲取 word 文檔里所有文字內(nèi)容（不包括圖片、表格等格式的內(nèi)容）
* @param doc Document 對象
* @return word 文檔中文字部分全部內(nèi)容
*/
public String getContent(HWPFDocument doc){
String content = "";
try {
content = doc.getText().toString().trim();
} catch (Exception e) {
e.printStackTrace();
}
return content.replaceAll("", "");
}

/**
* 獲取 word 文檔里所有文字內(nèi)容（不包括圖片、表格等格式的內(nèi)容）
* @param docPath doc 對象路徑
* @return word 文檔中文字部分全部內(nèi)容
*/
public String getContent(String docPath) {
StringBuffer strBuff = new StringBuffer("");
try {
WordExtractor extractor = new WordExtractor(new FileInputStream(docPath));
//extractor.getTextFromPieces();
String [] strArray = extractor.getParagraphText();
for(int i = 0; i < strArray.length; ++i) {
strBuff.append(strArray[i].trim());
}
} catch (Exception e) {
e.printStackTrace();
}
return strBuff.toString().replaceAll("", "");
}

/**
* 獲取 word 文檔里面所有圖片并另存到指定目錄下
* @param doc Document 對象
* @param picPath 保存圖片路徑
* @param suffix 后綴名
*/
public void getPictures(HWPFDocument doc, String picPath, String suffix) {
Range range = doc.getRange();
byte[] dataStream = doc.getDataStream();
int numChar = range.numCharacterRuns();
PicturesTable pTable = new PicturesTable(doc, dataStream, dataStream);
for (int i = 0; i < numChar; ++i) {
CharacterRun cuRun = range.getCharacterRun(i);
boolean hasPic = pTable.hasPicture(cuRun);
if (hasPic) {
Picture picture = pTable.extractPicture(cuRun, true);
try {
picture.writeImageContent(new FileOutputStream(picPath + i + suffix));
} catch (Exception e) {
e.printStackTrace();
}
}
}
}

/**
* 獲取word 文檔里面所有表格
* @param doc Document 對象
*/
public void getTables(HWPFDocument doc){
Range range = doc.getRange();
TableIterator tableIt = new TableIterator(range);
while (tableIt.hasNext()) {
Table table = (Table)tableIt.next();
for(int j=0;j<table.numRows();j++){
TableRow tr = table.getRow(j);
String content = "";
for(int i=0;i<tr.numCells();i++){
TableCell cell = tr.getCell(i);
for(int m=0;m<cell.numParagraphs();m++){ //獲取單元格內(nèi)容
Paragraph para = cell.getParagraph(m);
content += para.text().trim() + ";";
}
}
System.out.println(content);
}
}
}

/**
* 獲取文章中所有標題集合
* @param doc Document
* @return
*/
public List<String> getTitleList(HWPFDocument doc){

Range range = doc.getRange();
byte[] dataStream = doc.getDataStream();
int numP = range.numParagraphs();
List<String> titleList = new ArrayList<String>();

PicturesTable pTable = new PicturesTable(doc, dataStream, dataStream);
for(int i=0;i<numP;i++){
Range curRange = range.getParagraph(i);
Paragraph paragraph = range.getParagraph(i);
CharacterRun cr = curRange.getCharacterRun(0);
if(pTable.hasPicture(cr)){ //圖片
continue;
}else{
char currentChar = 0;
for(int k=0;k<cr.text().length();k++){
currentChar = cr.text().charAt(k);
if(currentChar != Const.SPACE_ASCII){
break;
}
}

if(currentChar == Const.ENTER_ASCII){ //回車符
continue;
}else if(currentChar == Const.SPACE_ASCII){ //空格符
continue;
}else if(currentChar == Const.TABULATION_ASCII){ //水平制表符
continue;
}
}

int numStyles = doc.getStyleSheet().numStyles();
int styleIndex = paragraph.getStyleIndex();
if (numStyles > styleIndex) {
StyleSheet style_sheet = doc.getStyleSheet();
StyleDescription style = style_sheet.getStyleDescription(styleIndex);
String styleName = style.getName();

if(styleName!=null&&styleName.contains("標題")){
titleList.add(paragraph.text().trim());
System.out.println(paragraph.text().trim());
}
}
}
return titleList;
}

/**
* 獲取整篇文章中所有標題樣式名稱
* @param doc Document
* @return
*/
public Set<String> getTitleStyleNameSet(HWPFDocument doc){
Range range = doc.getRange();
byte[] dataStream = doc.getDataStream();
int numP = range.numParagraphs();
Set<String> titNameSet = new HashSet<String>();

PicturesTable pTable = new PicturesTable(doc, dataStream, dataStream);
for(int i=0;i<numP;i++){
Range curRange = range.getParagraph(i);
Paragraph paragraph = range.getParagraph(i);
CharacterRun cr = curRange.getCharacterRun(0);
if(pTable.hasPicture(cr)){ //圖片
continue;
}else{
char currentChar = 0;
for(int k=0;k<cr.text().length();k++){
currentChar = cr.text().charAt(k);
if(currentChar != Const.SPACE_ASCII){
break;
}
}
if(currentChar == Const.ENTER_ASCII){ //回車符
continue;
}else if(currentChar == Const.SPACE_ASCII){ //空格符
continue;
}else if(currentChar == Const.TABULATION_ASCII){ //水平制表符
continue;
}
}
int numStyles = doc.getStyleSheet().numStyles();
int styleIndex = paragraph.getStyleIndex();
if (numStyles > styleIndex) {
StyleSheet style_sheet = doc.getStyleSheet();
StyleDescription style = style_sheet.getStyleDescription(styleIndex);
String styleName = style.getName();
if(styleName!=null&&styleName.contains("標題")){
if(styleName.contains(",")){
styleName = getFirstStyleName(styleName);
}
titNameSet.add(styleName);
}
}
}
return titNameSet;
}

/**
* 處理標題樣式名稱的特殊格式，如：“標題 3,標題 3 Char,標題 3 Char Char” ,只獲取“標題 3”
* @param styleName 需進行處理的標題樣式，如"標題 3,標題 3 Char,標題 3 Char Char”
* @return
*/
private String getFirstStyleName(String styleName){
if ((styleName != null) && (styleName.length() > 0)) {
int styleLeng = styleName.split(",").length;
if(styleLeng>1){
int comma = styleName.indexOf(",");
if(comma>-1&&(comma<styleName.length())){
return styleName.substring(0,comma);
}
}
}
return styleName;
}

/**
* 獲取當(dāng)前文章中最大標題樣式名稱，如“標題1”
* @param doc Document
* @return
*/
public String getMaxTitleStyleName(HWPFDocument doc){
Set<String> titNameSet = getTitleStyleNameSet(doc);
Iterator<String> it = titNameSet.iterator();
List<Integer> tempLst = new ArrayList<Integer>();
while(it.hasNext()){
String titName = it.next(); //得到“標題 1”、“標題 2”
try {
int curStyleName = Integer.parseInt(titName.substring(2).trim());
tempLst.add(curStyleName);
} catch (NumberFormatException e) {
continue;
}
}
int max = (tempLst.size()==0?0:tempLst.get(0));
for(int i=0;i<tempLst.size();i++){
int curSize = tempLst.get(i);
if(curSize<max){
max = curSize;
}
}
if(max==0){
return ""; //文章中不包含任何標題
}
return "標題 "+max;
}

/**
* 獲取word 文檔中最大的字體
* @param doc Document 對象
*/
public int getMaxFontSize(HWPFDocument doc) {
int fontSize = 0;
try {
Range range = doc.getRange();
for (int i = 0; i < range.numParagraphs(); i++) {
Paragraph poiPara = range.getParagraph(i);
int j = 0;
while (true) {
CharacterRun run = poiPara.getCharacterRun(j++);
if(fontSize < run.getFontSize()) {
fontSize = run.getFontSize();
}//字體大小
if (run.getEndOffset() == poiPara.getEndOffset()) {
break;
}
}
}
} catch (Exception e) {
e.printStackTrace();
}
return fontSize;
}

/**
* 獲取 word 文檔的創(chuàng)建信息
* @param docPath doc路徑
* @return 創(chuàng)建文檔的信息
*/
public Map<String, String> getInfo(String docPath) {
try {
InputStream is = new FileInputStream(docPath);
WordExtractor extractor = new WordExtractor(is);
SummaryInformation info = extractor.getSummaryInformation();
Map<String, String> mapInfo = new HashMap<String, String>();
mapInfo.put("author", info.getAuthor()); // 作者
mapInfo.put("title", info.getTitle()); // 標題
mapInfo.put("subject", info.getSubject()); // 主題
mapInfo.put("keyword", info.getKeywords()); // 關(guān)鍵詞
mapInfo.put("createdate", FormatTextUtil.dateFormat(info.getCreateDateTime())); // 創(chuàng)建時間
mapInfo.put("updatedate", FormatTextUtil.dateFormat(info.getLastSaveDateTime())); // 修改時間
} catch (Exception e) {
}
return null;
}
}

2007實現類：PoiXwpfExtractContentImpl.java

package com.peraglobal.extract.poi.impl;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;

import com.peraglobal.extract.poi.PoiExtractContent;
import com.peraglobal.extract.util.FormatTextUtil;

/**
* 使用 POI 解析 DOCX2007 文件的內(nèi)容信息
* @author yongqian.liu
* 2015-2-9
*/
public class PoiXwpfExtractContentImpl implements PoiExtractContent<XWPFDocument> {

/**
* 根據(jù)文件路徑獲得 Document 對象
* @param docxPath 路徑
* @return Document
*/
public XWPFDocument getDocument(String docxPath) {
//xwpfDocument是專門處理word的，在poi中還有處理其他office文檔的類
XWPFDocument docx = null;
try {
OPCPackage pack = POIXMLDocument.openPackage(docxPath);
docx = new XWPFDocument(pack) ;
} catch (Exception e) {
e.printStackTrace();
}
return docx;
}

/**
* 解析 word 文檔的標題
* @param docx Document 對象
* @return word 文檔中標題
*/
public String getTilte(XWPFDocument docx) {
String title = "\";
List<XWPFParagraph> paras = docx.getParagraphs();
XWPFParagraph p = null;
for (int i = 0; i < paras.size(); i++) {
if(p.getText() != null && !p.getText().equals("") && !p.getText().equals("r")){
title = p.getText().trim();
break;
}
}
return title;
}

/**
* 獲取 word 文檔里所有文字內(nèi)容（不包括圖片、表格等格式的內(nèi)容）
* @param docx Document 對象
* @return word 文檔中文字部分全部內(nèi)容
*/
public String getContent(XWPFDocument docx) {
String content = "";
try {
List<XWPFParagraph> paras = docx.getParagraphs();
for (XWPFParagraph para : paras) {
content += para.getText().trim();
}
} catch (Exception e) {
e.printStackTrace();
}
return content.replaceAll("", "");
}

/**
* 獲取 word 文檔里所有文字內(nèi)容（不包括圖片、表格等格式的內(nèi)容）
* @param docxPath docx 對象路徑
* @return word 文檔中文字部分全部內(nèi)容
*/
public String getContent(String docxPath) {
String content = "";
try {
OPCPackage opcPackage = POIXMLDocument.openPackage(docxPath);
POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
content += extractor.getText().trim();
} catch (Exception e) {
e.printStackTrace();
}
return content.replaceAll("", "");
}

/**
* 獲取 word 文檔里面所有圖片并另存到指定目錄下
* @param docx Document 對象
* @param picPath 保存圖片路徑
* @param suffix 后綴名
*/
public void getPictures(XWPFDocument docx, String picPath, String suffix){
List<XWPFPictureData> wpdList = docx.getAllPictures();
if(wpdList != null && wpdList.size() > 0){
for (int i = 0; i < wpdList.size(); i++) {
byte[] picByte = wpdList.get(i).getData(); //獲取圖片數(shù)據(jù)流
FileOutputStream fos = null;
try {
fos = new FileOutputStream(picPath + i + suffix);
} catch (FileNotFoundException e) {
e.printStackTrace();
}finally{
try {
fos.write(picByte);
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}

/**
* 獲取word 文檔里面所有表格
* @param doc Document 對象
*/
public void getTables(XWPFDocument docx){
Iterator<XWPFTable> tableIt = docx.getTablesIterator();
while (tableIt.hasNext()) {
XWPFTable table = tableIt.next();
String rowInfo = "";
for(int j = 0; j < table.getRows().size(); j ++){
List<XWPFTableCell> cells = table.getRow(j).getTableCells(); // 獲得所有列
for (int k = 0; k < cells.size(); k++) {
rowInfo += cells.get(k).getText().trim() + ";";
}
}
System.out.println(rowInfo);
}
}

/**
* 獲取word 文檔中最大的字體
* @param doc Document 對象
*/
public int getMaxFontSize(XWPFDocument docx) {
int fontSize = 0;
/* List<XWPFParagraph> paraGraph = docx.getParagraphs();
for(XWPFParagraph para :paraGraph ){
List<XWPFRun> run = para.getRuns();
for(XWPFRun r : run){
int i = 0;
System.out.println("字體顏色："+r.getColor());
System.out.println("字體名稱:"+r.getFontFamily());
System.out.println("字體大小："+r.getFontSize());
System.out.println("Text:"+r.getText(i++));
System.out.println("粗體？："+r.isBold());
System.out.println("斜體？："+r.isItalic());
if(fontSize < r.getFontSize()){
fontSize = r.getFontSize();
}
}
}*/

return fontSize;
}

/**
* 獲取 word 文檔的創(chuàng)建信息
* @param docPath docx路徑
* @return 創(chuàng)建文檔的信息
*/
public Map<String, String> getInfo(String docxPath) {
try {
InputStream is = new FileInputStream(docxPath);
XWPFDocument docx = new XWPFDocument(is);
XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
CoreProperties coreProps = extractor.getCoreProperties();
Map<String, String> mapInfo = new HashMap<String, String>();
//mapInfo.put("category", coreProps.getCategory()); //分類
mapInfo.put("author", coreProps.getCreator()); //創(chuàng)建者
mapInfo.put("title", coreProps.getTitle()); //標題
mapInfo.put("subject", coreProps.getSubject()); // 主題
mapInfo.put("keyword", coreProps.getKeywords()); // 關(guān)鍵詞
mapInfo.put("createdate", FormatTextUtil.dateFormat(coreProps.getCreated())); //創(chuàng)建時間
mapInfo.put("updatedate", FormatTextUtil.dateFormat(coreProps.getLastPrinted())); // 修改時間
} catch (Exception e) {
}
return null;
}
}

POI 使用 HWPFDocument 和 XWPFDocument 分割 word2003 和 word2010示例

相關(guān)推薦