Lucene 5(6):读取 Word、PDF 文件内容

废话不多说,直接上代码:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
/**
 * Reads the legacy Word document at {@code E:/test.doc} and prints its
 * extracted text to standard output.
 *
 * <p>The input stream, the document and the extractor are all managed by a
 * try-with-resources statement, so each of them is closed (in reverse order
 * of creation) even when parsing or text extraction throws.
 *
 * @throws Exception if the file cannot be opened, parsed, or closed
 */
@Test
public void readWord2003() throws Exception {
    try (InputStream is = new FileInputStream("E:/test.doc");
         HWPFDocument doc2003 = new HWPFDocument(is);
         WordExtractor word2003 = new WordExtractor(doc2003)) {
        System.out.println(word2003.getText());
    }
}

/**
 * Reads the OOXML Word document at {@code E:/test.docx} and prints its
 * extracted text to standard output.
 *
 * <p>The input stream, the document and the extractor are all managed by a
 * try-with-resources statement, so each of them is closed (in reverse order
 * of creation) even when parsing or text extraction throws.
 *
 * @throws Exception if the file cannot be opened, parsed, or closed
 */
@Test
public void readWord2007() throws Exception {
    try (InputStream is = new FileInputStream("E:/test.docx");
         XWPFDocument doc2007 = new XWPFDocument(is);
         XWPFWordExtractor word2007 = new XWPFWordExtractor(doc2007)) {
        System.out.println(word2007.getText());
    }
}

/**
 * Reads the PDF at {@code E:/test.pdf} and prints its extracted text to
 * standard output.
 *
 * <p>The input stream and the loaded document are managed by a
 * try-with-resources statement, so both are closed even when loading or
 * text extraction throws.
 *
 * @throws Exception if the file cannot be opened, parsed, or closed
 */
@Test
public void readPDF() throws Exception {
    try (InputStream is = new FileInputStream("E:/test.pdf");
         PDDocument document = PDDocument.load(is)) {
        // Create the text stripper used to pull plain text out of the document.
        PDFTextStripper stripper = new PDFTextStripper();
        // Extract the text content of the document.
        String content = stripper.getText(document);
        System.out.println(content);
    }
}