Location>code7788 >text

Extracting feature data from documents (PDF, DOC, DOCX, TXT) with Apache Tika

Popularity:145 ℃/2025-04-23 11:15:45
package ; import ; import ; import ; import ; import ; import ; import ; /*** Extract data from pdf, doc, docx, txt through apache tika * Core dependency jar [tika-core 2.8.0, tika-parsers-standard-package 2.8.0 (when parsing word, additional dependency is required to be made on xmlbeans 5.1.1)] * Assume that the content in the document has the following properties: * [Author (signed): Zhang San * ID number: 322025199902256056 ] * The content to be extracted is Zhang San and 322025199902256056. The value of Zhang San and 322025199902256056 will change*/ public class TikaExtrator { public static void main(String[] args) { try { ////Replace with the actual PDF file path Test example: such as test.xlsx. InputStream input = TikaExtrator.class.getClassLoader().getResourceAsStream("Comprehensive Information Query Authorization Letter Test.docx"); String text = extractTextFromFile(input); ("text: " + text); String name = extractName(text); String idNumber = extractIdNumber(text); ("Author's Name: " + name); ("Identity card number: " + idNumber); } catch (IOException e) { (); } } /** * * @param inputStream * @return * @throws IOException */ private static String extractTextFromFile(InputStream inputStream) throws IOException { Tika tika = new Tika(); try { return (inputStream); } catch (TikaException e) { throw new RuntimeException(e); } } private static String extractName(String text) { Pattern pattern = ("Author (Signed)[::]([\\u4e00-\\u9fa5]+)"); Matcher matcher = (text); if (()) { return (1); } return ""; } private static String extractIdNumber(String text) { Pattern pattern = ("ID number[::](\\d{18}|\\d{15})"); Matcher matcher = (text); if (()) { return (1); } return ""; } }