package ;
import ;
import ;
import ;
import ;
import ;
import ;
import ;
/**
 * Extracts data from PDF, DOC, DOCX and TXT files using Apache Tika.
 *
 * <p>Core dependencies: tika-core 2.8.0 and tika-parsers-standard-package 2.8.0
 * (parsing Word documents additionally requires xmlbeans 5.1.1).
 *
 * <p>The document is assumed to contain lines such as:
 * <pre>
 * Author (signed): Zhang San
 * ID number: 322025199902256056
 * </pre>
 * The values to extract are the name ("Zhang San") and the ID number
 * ("322025199902256056"); both values differ from document to document.
 */
public class TikaExtrator {
public static void main(String[] args) {
try {
////Replace with the actual PDF file path Test example: such as test.xlsx.
InputStream input = TikaExtrator.class.getClassLoader().getResourceAsStream("Comprehensive Information Query Authorization Letter Test.docx");
String text = extractTextFromFile(input);
("text: " + text);
String name = extractName(text);
String idNumber = extractIdNumber(text);
("Author's Name: " + name);
("Identity card number: " + idNumber);
} catch (IOException e) {
();
}
}
/**
*
* @param inputStream
* @return
* @throws IOException
*/
private static String extractTextFromFile(InputStream inputStream) throws IOException {
Tika tika = new Tika();
try {
return (inputStream);
} catch (TikaException e) {
throw new RuntimeException(e);
}
}
private static String extractName(String text) {
Pattern pattern = ("Author (Signed)[::]([\\u4e00-\\u9fa5]+)");
Matcher matcher = (text);
if (()) {
return (1);
}
return "";
}
private static String extractIdNumber(String text) {
Pattern pattern = ("ID number[::](\\d{18}|\\d{15})");
Matcher matcher = (text);
if (()) {
return (1);
}
return "";
}
}