EJB(xml/mail/jms/jdbc)
2017.07.07 / 10:19
Java - 문서 파싱 및 추출(pdf, doc, docx, xls, xlsx, ppt, pptx)
클래식로얄
추천 수 199
# 문서 파싱 및 추출(pdf, doc, docx, xls, xlsx, ppt, pptx)
# 필요 라이브러리
Apache PDFBox : http://pdfbox.apache.org/downloads.html
Apache POI : http://poi.apache.org/download.html
dom4j-1.6.1.jar
pdfbox-app-1.8.1.jar
poi-3.9-20121203.jar
poi-examples-3.9-20121203.jar
poi-ooxml-3.9-20121203.jar
poi-ooxml-schemas-3.9-20121203.jar
poi-scratchpad-3.9-20121203.jar
xmlbeans-2.3.0.jar
# PDF 파서
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 | import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import org.apache.pdfbox.cos.COSDocument; import org.apache.pdfbox.pdfparser.PDFParser; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.util.PDFTextStripper; /** * This class parses the pdf file. * i.e this class returns the text from the pdf file. * @author Mubin Shrestha */ public class PdfFileParser { public String PdfFileParser(String pdffilePath) throws FileNotFoundException, IOException { String content; FileInputStream fi = new FileInputStream( new File(pdffilePath)); PDFParser parser = new PDFParser(fi); parser.parse(); COSDocument cd = parser.getDocument(); PDFTextStripper stripper = new PDFTextStripper(); content = stripper.getText( new PDDocument(cd)); cd.close(); return content; } public static void main(String args[]) throws FileNotFoundException, IOException { String filepath = "fullPath" ; System.out.println( new PdfFileParser().PdfFileParser(filepath)); } } |
# doc, xls, ppt 파서
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 | import java.io.FileInputStream; import org.apache.poi.hslf.extractor.PowerPointExtractor; import org.apache.poi.hssf.extractor.ExcelExtractor; import org.apache.poi.hssf.usermodel.HSSFWorkbook; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.extractor.WordExtractor; import org.apache.poi.poifs.filesystem.POIFSFileSystem; /** * This class parses the microsoft word files except .docx,.pptx and * latest MSword files. * * @author Mubin Shrestha */ public class DocFileParser { public String DocFileContentParser(String fileName) { POIFSFileSystem fs = null ; try { fs = new POIFSFileSystem( new FileInputStream(fileName)); if (fileName.endsWith( ".doc" )) { HWPFDocument doc = new HWPFDocument(fs); WordExtractor we = new WordExtractor(doc); return we.getText(); } else if (fileName.endsWith( ".xls" )) { // HSSFWorkbook wb = new HSSFWorkbook(new FileInputStream(fileName)); ExcelExtractor ex = new ExcelExtractor(fs); ex.setFormulasNotResults( true ); ex.setIncludeSheetNames( true ); return ex.getText(); } else if (fileName.endsWith( ".ppt" )) { PowerPointExtractor extractor = new PowerPointExtractor(fs); return extractor.getText(); } } catch (Exception e) { System.out.println( "document file cant be indexed" ); } return "" ; } public static void main(String args[]){ String filepath = "fullPath" ; System.out.println( new DocFileParser().DocFileContentParser(filepath)); } } |
# docx, xlsx, pptx 파서
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 | import java.io.File; import java.io.FileInputStream; import org.apache.poi.openxml4j.opc.OPCPackage; import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor; import org.apache.poi.xssf.extractor.XSSFExcelExtractor; import org.apache.poi.xwpf.extractor.XWPFWordExtractor; public class DocxFileParser { public String docxFileContentParser(String fileName){ try { FileInputStream fs = new FileInputStream( new File(fileName)); OPCPackage d = OPCPackage.open(fs); if (fileName.endsWith( ".docx" )){ XWPFWordExtractor xw = new XWPFWordExtractor(d); return xw.getText(); } else if (fileName.endsWith( ".pptx" )){ XSLFPowerPointExtractor xp = new XSLFPowerPointExtractor(d); return xp.getText(); } else if (fileName.endsWith( ".xlsx" )){ XSSFExcelExtractor xe = new XSSFExcelExtractor(d); xe.setFormulasNotResults( true ); xe.setIncludeSheetNames( true ); return xe.getText(); } } catch (Exception e){ System.out.println( "# DocxFileParser Error :" +e.getMessage()); } return "" ; } public static void main(String args[]){ String filePath = "fullPath" ; System.out.println( new DocxFileParser().docxFileContentParser(filePath)); } } |
# 참고 사이트 : http://computergodzilla.blogspot.kr
첨부파일
domparser_poi.zip