EJB(xml/mail/jms/jdbc)
2017.07.07 / 10:19

Java - 문서 파싱 및 추출(pdf, doc, docx, xls, xlsx, ppt, pptx)

클래식로얄
추천 수 199

# 문서 파싱 및 추출(pdf, doc, docx, xls, xlsx, ppt, pptx)


# 필요 라이브러리

Apache PDFBox : http://pdfbox.apache.org/downloads.html
Apache POI : http://poi.apache.org/download.html

dom4j-1.6.1.jar
pdfbox-app-1.8.1.jar
poi-3.9-20121203.jar
poi-examples-3.9-20121203.jar
poi-ooxml-3.9-20121203.jar
poi-ooxml-schemas-3.9-20121203.jar
poi-scratchpad-3.9-20121203.jar
xmlbeans-2.3.0.jar

# PDF 파서

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
 
/**
 * This class parses the pdf file.
 * i.e this class returns the text from the pdf file.
 * @author Mubin Shrestha
 */
public class PdfFileParser {
 
    public String PdfFileParser(String pdffilePath) throws FileNotFoundException, IOException
    {
        String content;
        FileInputStream fi = new FileInputStream(new File(pdffilePath));
        PDFParser parser = new PDFParser(fi);
        parser.parse();
        COSDocument cd = parser.getDocument();
        PDFTextStripper stripper = new PDFTextStripper();
        content = stripper.getText(new PDDocument(cd));
        cd.close();
        return content;
    }
     
    public static void main(String args[]) throws FileNotFoundException, IOException
    {
        String filepath = "fullPath";
        System.out.println(new PdfFileParser().PdfFileParser(filepath));   
    }
}



# doc, xls, ppt 파서

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import java.io.FileInputStream;
import org.apache.poi.hslf.extractor.PowerPointExtractor;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 
/**
* This class parses the microsoft word files except .docx,.pptx and
* latest MSword files.
*
* @author Mubin Shrestha
*/
public class DocFileParser {
   
  public String DocFileContentParser(String fileName) {
      POIFSFileSystem fs = null;
      try {
          
          fs = new POIFSFileSystem(new FileInputStream(fileName));
           
          if(fileName.endsWith(".doc")) {
              HWPFDocument doc = new HWPFDocument(fs);
              WordExtractor we = new WordExtractor(doc);
              return we.getText();
          }else if(fileName.endsWith(".xls")) {
//              HSSFWorkbook wb = new HSSFWorkbook(new FileInputStream(fileName));
              ExcelExtractor ex = new ExcelExtractor(fs);
              ex.setFormulasNotResults(true);
              ex.setIncludeSheetNames(true);
              return ex.getText();
          } else if (fileName.endsWith(".ppt")) {
              PowerPointExtractor extractor = new PowerPointExtractor(fs);
              return extractor.getText();
          }
 
      } catch (Exception e) {
          System.out.println("document file cant be indexed");
      }
      return "";
  }
 
  public static void main(String args[]){
      String filepath = "fullPath";
      System.out.println(new DocFileParser().DocFileContentParser(filepath));
  }
}


# docx, xlsx, pptx 파서

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import java.io.File;
import java.io.FileInputStream;
 
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xslf.extractor.XSLFPowerPointExtractor;
import org.apache.poi.xssf.extractor.XSSFExcelExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
 
public class DocxFileParser {
 
    public String docxFileContentParser(String fileName){
 
        try{
            FileInputStream fs = new FileInputStream(new File(fileName));
            OPCPackage d = OPCPackage.open(fs);
            if(fileName.endsWith(".docx")){
                XWPFWordExtractor xw = new XWPFWordExtractor(d);
                return xw.getText();
            }else if(fileName.endsWith(".pptx")){
                XSLFPowerPointExtractor xp = new XSLFPowerPointExtractor(d);
                return xp.getText();
            }else if(fileName.endsWith(".xlsx")){
                XSSFExcelExtractor xe = new XSSFExcelExtractor(d);
                xe.setFormulasNotResults(true);
                xe.setIncludeSheetNames(true);
                return xe.getText();
            }
        }catch(Exception e){
            System.out.println("# DocxFileParser Error :"+e.getMessage());
        }
        return "";
    }
 
    public static void main(String args[]){
        String filePath = "fullPath";
        System.out.println(new DocxFileParser().docxFileContentParser(filePath));
    }   
}



# 참고 사이트 : http://computergodzilla.blogspot.kr