[摘要:自己在做項目過程中用到了解析分割 word2003 和 2007、2010 等文件內容,以下是代碼:下面是部分代碼,如果想要全部代碼可以發我郵箱,yongqian.liu@peraglobal.com,接口類:PoiExtractContent.java]
自己在做項目過程中用到了解析分割word2003和2007,2010等文件內容,以下是代碼: 下面是部分代碼,如果想要全部代碼可以發我郵箱,yongqian.liu@peraglobal.com,

接口類 :PoiExtractContent.java

package com.peraglobal.extract.poi;

import java.util.Map;

/**
 * Contract for extracting the content of a Word file with Apache POI.
 *
 * @param <T> the POI document type an implementation works on
 * @author yongqian.liu
 * 2015-2-9
 */
public interface PoiExtractContent<T> {

    /**
     * Loads the document object for a file.
     *
     * @param docPath path of the Word file
     * @return the document object
     */
    T getDocument(String docPath);

    /**
     * Resolves the title of a Word document.
     * (The method name keeps its historical spelling; implementations depend on it.)
     *
     * @param doc document object
     * @return the title found in the document
     */
    String getTilte(T doc);

    /**
     * Returns all text of a Word document (pictures, tables and other
     * formatted content are not included).
     *
     * @param doc document object
     * @return the complete textual content of the document
     */
    String getContent(T doc);

    /**
     * Returns all text of a Word document (pictures, tables and other
     * formatted content are not included).
     *
     * @param docPath path of the Word file
     * @return the complete textual content of the document
     */
    String getContent(String docPath);

    /**
     * Extracts every picture of a Word document and saves it under the given location.
     *
     * @param doc     document object
     * @param picPath path the pictures are saved to
     * @param suffix  file-name suffix of the saved pictures
     */
    void getPictures(T doc, String picPath, String suffix);

    /**
     * Reads every table of a Word document.
     *
     * @param doc document object
     */
    void getTables(T doc);

    /**
     * Returns the largest font size used in a Word document.
     *
     * @param doc document object
     * @return the largest font size
     */
    int getMaxFontSize(T doc);

    /**
     * Returns the creation information of a Word document.
     *
     * @param docPath path of the Word file
     * @return information about the creation of the document
     */
    Map<String, String> getInfo(String docPath);
}

2003實現:PoiHwpfExtractContentImpl.java

package com.peraglobal.extract.poi.impl;

import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import
org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableIterator;
import org.apache.poi.hwpf.usermodel.TableRow;

import java.io.Closeable;

import com.peraglobal.extract.poi.PoiExtractContent;
import com.peraglobal.extract.util.Const;
import com.peraglobal.extract.util.FormatTextUtil;

/**
 * Extracts the content of a DOC (Word 2003) file using Apache POI.
 *
 * <p>All methods are best-effort: I/O and parsing failures are reported with
 * {@code printStackTrace()} and an empty/default result is returned.
 *
 * @author yongqian.liu
 * 2015-2-9
 */
public class PoiHwpfExtractContentImpl implements PoiExtractContent<HWPFDocument> {

    /**
     * Fragment a style name must contain to be treated as a heading style.
     * NOTE(review): this article's text appears to have been run through a
     * character converter; confirm the literal matches the style names of the
     * documents you actually process.
     */
    private static final String HEADING_STYLE_MARKER = "標題";

    /**
     * Loads the document object for a file.
     *
     * @param docPath path of the .doc file
     * @return the loaded document, or {@code null} if it could not be read
     */
    public HWPFDocument getDocument(String docPath) {
        // HWPFDocument handles Word only; POI has other classes for other Office formats.
        HWPFDocument doc = null;
        InputStream in = null;
        try {
            in = new FileInputStream(docPath);
            doc = new HWPFDocument(in);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The document is built from the stream inside the constructor,
            // so the file handle is no longer needed afterwards.
            closeQuietly(in);
        }
        return doc;
    }

    /**
     * Resolves the title of a Word document: the trimmed text of the first
     * paragraph that is not blank.
     *
     * @param doc document object
     * @return the title, or an empty string when every paragraph is blank
     */
    public String getTilte(HWPFDocument doc) {
        Range range = doc.getRange();
        for (int i = 0; i < range.numParagraphs(); i++) {
            String text = range.getParagraph(i).text();
            // trim() also drops the paragraph mark, so a paragraph holding only
            // the mark or whitespace counts as blank.
            if (text != null && text.trim().length() > 0) {
                return text.trim();
            }
        }
        return "";
    }

    /**
     * Returns all text of a Word document (pictures, tables and other
     * formatted content are not included).
     *
     * @param doc document object
     * @return the trimmed document text, or an empty string on failure
     */
    public String getContent(HWPFDocument doc) {
        String content = "";
        try {
            content = doc.getText().toString().trim();
        } catch (Exception e) {
            e.printStackTrace();
        }
        // NOTE(review): the published source ended with replaceAll("", ""), which
        // is a no-op; the intended pattern (presumably a control character lost
        // in publishing) is unknown, so no further stripping is done here.
        return content;
    }

    /**
     * Returns all text of a Word document (pictures, tables and other
     * formatted content are not included), read paragraph by paragraph with
     * each paragraph trimmed.
     *
     * @param docPath path of the .doc file
     * @return the concatenated paragraph text; whatever was collected before a
     *         failure (possibly empty) if the file cannot be fully read
     */
    public String getContent(String docPath) {
        StringBuilder text = new StringBuilder();
        InputStream in = null;
        try {
            in = new FileInputStream(docPath);
            WordExtractor extractor = new WordExtractor(in);
            String[] paragraphs = extractor.getParagraphText();
            for (int i = 0; i < paragraphs.length; ++i) {
                text.append(paragraphs[i].trim());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(in);
        }
        // See getContent(HWPFDocument) about the removed no-op replaceAll("", "").
        return text.toString();
    }

    /**
     * Extracts every picture of a Word document and saves it as
     * {@code picPath + <character-run index> + suffix}.
     *
     * @param doc     document object
     * @param picPath path prefix the pictures are saved under
     * @param suffix  file-name suffix of the saved pictures
     */
    public void getPictures(HWPFDocument doc, String picPath, String suffix) {
        Range range = doc.getRange();
        byte[] dataStream = doc.getDataStream();
        int runCount = range.numCharacterRuns();
        PicturesTable pictures = new PicturesTable(doc, dataStream, dataStream);
        for (int i = 0; i < runCount; ++i) {
            CharacterRun run = range.getCharacterRun(i);
            if (!pictures.hasPicture(run)) {
                continue;
            }
            Picture picture = pictures.extractPicture(run, true);
            FileOutputStream out = null;
            try {
                out = new FileOutputStream(picPath + i + suffix);
                picture.writeImageContent(out);
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Always release the file handle, one per picture.
                closeQuietly(out);
            }
        }
    }

    /**
     * Reads every table of a Word document and prints one line per table row
     * to standard output; each cell paragraph is trimmed and followed by ";".
     *
     * @param doc document object
     */
    public void getTables(HWPFDocument doc) {
        TableIterator tables = new TableIterator(doc.getRange());
        while (tables.hasNext()) {
            Table table = (Table) tables.next();
            for (int r = 0; r < table.numRows(); r++) {
                TableRow row = table.getRow(r);
                StringBuilder line = new StringBuilder();
                for (int c = 0; c < row.numCells(); c++) {
                    TableCell cell = row.getCell(c);
                    for (int p = 0; p < cell.numParagraphs(); p++) {
                        // Content of the cell, paragraph by paragraph.
                        line.append(cell.getParagraph(p).text().trim()).append(";");
                    }
                }
                System.out.println(line.toString());
            }
        }
    }

    /**
     * Collects the text of every paragraph whose style is a heading style.
     * Each heading is also printed to standard output.
     *
     * @param doc document object
     * @return the trimmed heading texts in document order
     */
    public List<String> getTitleList(HWPFDocument doc) {
        Range range = doc.getRange();
        byte[] dataStream = doc.getDataStream();
        List<String> titleList = new ArrayList<String>();
        PicturesTable pictures = new PicturesTable(doc, dataStream, dataStream);
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            if (isPictureOrBlank(pictures, paragraph)) {
                continue;
            }
            if (headingStyleName(doc, paragraph) != null) {
                String heading = paragraph.text().trim();
                titleList.add(heading);
                System.out.println(heading);
            }
        }
        return titleList;
    }

    /**
     * Collects the names of all heading styles used in the document. A
     * comma-separated style name is reduced to its first component.
     *
     * @param doc document object
     * @return the distinct heading style names
     */
    public Set<String> getTitleStyleNameSet(HWPFDocument doc) {
        Range range = doc.getRange();
        byte[] dataStream = doc.getDataStream();
        Set<String> styleNames = new HashSet<String>();
        PicturesTable pictures = new PicturesTable(doc, dataStream, dataStream);
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            if (isPictureOrBlank(pictures, paragraph)) {
                continue;
            }
            String styleName = headingStyleName(doc, paragraph);
            if (styleName == null) {
                continue;
            }
            if (styleName.contains(",")) {
                styleName = getFirstStyleName(styleName);
            }
            styleNames.add(styleName);
        }
        return styleNames;
    }

    /**
     * Tells whether a paragraph should be ignored when looking for headings:
     * its first character run is a picture, or the first non-space character
     * of that run is a carriage return, a space (run of spaces only) or a tab.
     * A paragraph without any character run is ignored as well.
     */
    private boolean isPictureOrBlank(PicturesTable pictures, Paragraph paragraph) {
        if (paragraph.numCharacterRuns() == 0) {
            return true;
        }
        CharacterRun firstRun = paragraph.getCharacterRun(0);
        if (pictures.hasPicture(firstRun)) {
            return true; // picture
        }
        String runText = firstRun.text();
        char first = 0;
        for (int k = 0; k < runText.length(); k++) {
            first = runText.charAt(k);
            if (first != Const.SPACE_ASCII) {
                break;
            }
        }
        return first == Const.ENTER_ASCII        // carriage return
                || first == Const.SPACE_ASCII    // space
                || first == Const.TABULATION_ASCII; // horizontal tab
    }

    /**
     * Returns the style name of a paragraph when it is a heading style,
     * otherwise {@code null} (also when the style index is out of range or
     * the style sheet has no description for it).
     */
    private String headingStyleName(HWPFDocument doc, Paragraph paragraph) {
        StyleSheet styleSheet = doc.getStyleSheet();
        int styleIndex = paragraph.getStyleIndex();
        if (styleIndex >= styleSheet.numStyles()) {
            return null;
        }
        StyleDescription style = styleSheet.getStyleDescription(styleIndex);
        if (style == null) {
            return null;
        }
        String name = style.getName();
        return (name != null && name.contains(HEADING_STYLE_MARKER)) ? name : null;
    }

    /**
     * Reduces a compound heading style name to its first component, e.g.
     * "標題 3,標題 3 Char,標題 3 Char Char" becomes "標題 3".
     *
     * @param styleName style name to reduce; may be {@code null}
     * @return the part before the first comma, or the input when it has no comma
     */
    private String getFirstStyleName(String styleName) {
        if (styleName == null) {
            return null;
        }
        int comma = styleName.indexOf(",");
        return comma >= 0 ? styleName.substring(0, comma) : styleName;
    }

    /**
     * Returns the name of the highest-level heading style used in the
     * document, i.e. the numbered heading style with the smallest number
     * (for example "標題 1").
     *
     * @param doc document object
     * @return the style name, or an empty string when the document has no
     *         numbered heading style
     */
    public String getMaxTitleStyleName(HWPFDocument doc) {
        Integer topLevel = null;
        for (String styleName : getTitleStyleNameSet(doc)) {
            // Only names of the form "<marker> <number>" carry a level.
            if (!styleName.startsWith(HEADING_STYLE_MARKER)) {
                continue;
            }
            try {
                int level = Integer.parseInt(
                        styleName.substring(HEADING_STYLE_MARKER.length()).trim());
                if (topLevel == null || level < topLevel.intValue()) {
                    topLevel = Integer.valueOf(level);
                }
            } catch (NumberFormatException notNumbered) {
                // Heading style without a numeric level: it cannot be ranked, skip it.
            }
        }
        if (topLevel == null || topLevel.intValue() == 0) {
            return ""; // the document contains no heading
        }
        return "標題 " + topLevel;
    }

    /**
     * Returns the largest font size used by any character run of the document.
     * NOTE(review): the value is whatever {@code CharacterRun.getFontSize()}
     * reports; confirm its unit before comparing with the DOCX implementation.
     *
     * @param doc document object
     * @return the largest font size found so far, 0 when nothing could be read
     */
    public int getMaxFontSize(HWPFDocument doc) {
        int fontSize = 0;
        try {
            Range range = doc.getRange();
            for (int i = 0; i < range.numParagraphs(); i++) {
                Paragraph paragraph = range.getParagraph(i);
                // Bounded loop over the paragraph's runs; no reliance on an
                // exception to terminate.
                int runCount = paragraph.numCharacterRuns();
                for (int j = 0; j < runCount; j++) {
                    int size = paragraph.getCharacterRun(j).getFontSize();
                    if (fontSize < size) {
                        fontSize = size;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return fontSize;
    }

    /**
     * Returns the creation information of a Word document under the keys
     * {@code author}, {@code title}, {@code subject}, {@code keyword},
     * {@code createdate} and {@code updatedate}.
     *
     * @param docPath path of the .doc file
     * @return the collected information; never {@code null}. When the file
     *         cannot be read or has no summary information the map holds
     *         whatever was collected before the failure (possibly nothing).
     */
    public Map<String, String> getInfo(String docPath) {
        Map<String, String> mapInfo = new HashMap<String, String>();
        InputStream is = null;
        try {
            is = new FileInputStream(docPath);
            WordExtractor extractor = new WordExtractor(is);
            SummaryInformation info = extractor.getSummaryInformation();
            if (info != null) {
                mapInfo.put("author", info.getAuthor()); // author
                mapInfo.put("title", info.getTitle()); // title
                mapInfo.put("subject", info.getSubject()); // subject
                mapInfo.put("keyword", info.getKeywords()); // keywords
                mapInfo.put("createdate", FormatTextUtil.dateFormat(info.getCreateDateTime())); // creation time
                mapInfo.put("updatedate", FormatTextUtil.dateFormat(info.getLastSaveDateTime())); // last-save time
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(is);
        }
        return mapInfo;
    }

    /** Closes a resource, ignoring {@code null} and any failure of {@code close()}. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception ignored) {
            // Nothing useful can be done if closing fails.
        }
    }
}

2007實現類:PoiXwpfExtractContentImpl.java

package com.peraglobal.extract.poi.impl;

import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.POIXMLProperties.CoreProperties;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;

import com.peraglobal.extract.poi.PoiExtractContent;
import com.peraglobal.extract.util.FormatTextUtil;

/**
 * Extracts the content of a DOCX (Word 2007) file using Apache POI.
 *
 * <p>All methods are best-effort: I/O and parsing failures are reported with
 * {@code printStackTrace()} and an empty/default result is returned.
 *
 * @author yongqian.liu
 * 2015-2-9
 */
public class PoiXwpfExtractContentImpl implements PoiExtractContent<XWPFDocument> {

    /**
     * Loads the document object for a file. The underlying package stays open
     * because the returned document is backed by it.
     *
     * @param docxPath path of the .docx file
     * @return the loaded document, or {@code null} if it could not be read
     */
    public XWPFDocument getDocument(String docxPath) {
        // XWPFDocument handles Word only; POI has other classes for other Office formats.
        XWPFDocument docx = null;
        try {
            OPCPackage pack = POIXMLDocument.openPackage(docxPath);
            docx = new XWPFDocument(pack);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return docx;
    }

    /**
     * Resolves the title of a Word document: the trimmed text of the first
     * paragraph that is not blank.
     *
     * @param docx document object
     * @return the title, or an empty string when every paragraph is blank
     */
    public String getTilte(XWPFDocument docx) {
        List<XWPFParagraph> paras = docx.getParagraphs();
        for (int i = 0; i < paras.size(); i++) {
            String text = paras.get(i).getText();
            if (text != null && text.trim().length() > 0) {
                return text.trim();
            }
        }
        return "";
    }

    /**
     * Returns all text of a Word document (pictures, tables and other
     * formatted content are not included): the trimmed text of every body
     * paragraph, concatenated.
     *
     * @param docx document object
     * @return the concatenated text; whatever was collected before a failure
     */
    public String getContent(XWPFDocument docx) {
        StringBuilder content = new StringBuilder();
        try {
            for (XWPFParagraph para : docx.getParagraphs()) {
                content.append(para.getText().trim());
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        // NOTE(review): the published source ended with replaceAll("", ""), which
        // is a no-op; the intended pattern is unknown, so nothing is stripped here.
        return content.toString();
    }

    /**
     * Returns the text of a Word document as produced by POI's text extractor.
     *
     * @param docxPath path of the .docx file
     * @return the trimmed extracted text, or an empty string on failure
     */
    public String getContent(String docxPath) {
        String content = "";
        OPCPackage opcPackage = null;
        try {
            opcPackage = POIXMLDocument.openPackage(docxPath);
            POIXMLTextExtractor extractor = new XWPFWordExtractor(opcPackage);
            content = extractor.getText().trim();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (opcPackage != null) {
                // Read-only use: release the file without writing anything back.
                opcPackage.revert();
            }
        }
        // See getContent(XWPFDocument) about the removed no-op replaceAll("", "").
        return content;
    }

    /**
     * Extracts every picture of a Word document and saves it as
     * {@code picPath + <picture index> + suffix}.
     *
     * @param docx    document object
     * @param picPath path prefix the pictures are saved under
     * @param suffix  file-name suffix of the saved pictures
     */
    public void getPictures(XWPFDocument docx, String picPath, String suffix) {
        List<XWPFPictureData> pictures = docx.getAllPictures();
        if (pictures == null) {
            return;
        }
        for (int i = 0; i < pictures.size(); i++) {
            byte[] picBytes = pictures.get(i).getData(); // raw picture data
            FileOutputStream fos = null;
            try {
                fos = new FileOutputStream(picPath + i + suffix);
                fos.write(picBytes);
            } catch (IOException e) {
                // Covers FileNotFoundException from opening the target as well.
                e.printStackTrace();
            } finally {
                closeQuietly(fos);
            }
        }
    }

    /**
     * Reads every table of a Word document and prints one line per table to
     * standard output; each cell text is trimmed and followed by ";".
     * NOTE(review): the DOC implementation prints one line per row — confirm
     * which granularity is intended.
     *
     * @param docx document object
     */
    public void getTables(XWPFDocument docx) {
        Iterator<XWPFTable> tableIt = docx.getTablesIterator();
        while (tableIt.hasNext()) {
            XWPFTable table = tableIt.next();
            StringBuilder tableInfo = new StringBuilder();
            for (int j = 0; j < table.getRows().size(); j++) {
                List<XWPFTableCell> cells = table.getRow(j).getTableCells(); // all cells of the row
                for (int k = 0; k < cells.size(); k++) {
                    tableInfo.append(cells.get(k).getText().trim()).append(";");
                }
            }
            System.out.println(tableInfo.toString());
        }
    }

    /**
     * Returns the largest font size explicitly set on any run of the body
     * paragraphs. Runs without an explicit size do not contribute.
     * NOTE(review): the value is whatever {@code XWPFRun.getFontSize()}
     * reports; confirm its unit before comparing with the DOC implementation.
     *
     * @param docx document object
     * @return the largest font size found, 0 when none is set or on failure
     */
    public int getMaxFontSize(XWPFDocument docx) {
        int fontSize = 0;
        try {
            for (XWPFParagraph para : docx.getParagraphs()) {
                for (XWPFRun run : para.getRuns()) {
                    int size = run.getFontSize();
                    if (fontSize < size) {
                        fontSize = size;
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return fontSize;
    }

    /**
     * Returns the creation information of a Word document under the keys
     * {@code author}, {@code title}, {@code subject}, {@code keyword},
     * {@code createdate} and {@code updatedate}.
     *
     * @param docxPath path of the .docx file
     * @return the collected information; never {@code null}. When the file
     *         cannot be read the map holds whatever was collected before the
     *         failure (possibly nothing).
     */
    public Map<String, String> getInfo(String docxPath) {
        Map<String, String> mapInfo = new HashMap<String, String>();
        InputStream is = null;
        try {
            is = new FileInputStream(docxPath);
            XWPFDocument docx = new XWPFDocument(is);
            XWPFWordExtractor extractor = new XWPFWordExtractor(docx);
            CoreProperties coreProps = extractor.getCoreProperties();
            mapInfo.put("author", coreProps.getCreator()); // creator
            mapInfo.put("title", coreProps.getTitle()); // title
            mapInfo.put("subject", coreProps.getSubject()); // subject
            mapInfo.put("keyword", coreProps.getKeywords()); // keywords
            mapInfo.put("createdate", FormatTextUtil.dateFormat(coreProps.getCreated())); // creation time
            // Modification time, matching the last-save time the DOC implementation
            // stores under this key (the last-printed date was used here before).
            mapInfo.put("updatedate", FormatTextUtil.dateFormat(coreProps.getModified()));
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(is);
        }
        return mapInfo;
    }

    /** Closes a resource, ignoring {@code null} and any failure of {@code close()}. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (Exception ignored) {
            // Nothing useful can be done if closing fails.
        }
    }
}
感謝關注 Ithao123精品文庫頻道,ithao123.cn是專門為互聯網人打造的學習交流平臺,全面滿足互聯網人工作與學習需求,更多互聯網資訊盡在 IThao123!
|