// Java tutorial
/* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package net.dstserbak.dataindexer.tokenizer; import java.io.*; import org.apache.pdfbox.pdmodel.*; import org.apache.pdfbox.util.*; import org.apache.pdfbox.io.RandomAccessFile; import java.util.StringTokenizer; import java.net.URL; import java.util.logging.Level; import java.util.logging.Logger; /** * Utility class provides methods to tokenize PDF documents. * * @author Deniss Stserbak */ final public class PDFTokenizer { /** * Logger for this class */ private static final Logger log = Logger.getLogger(PDFTokenizer.class.getName()); /** * Temporary file name, that is needed to read large documents. */ private static final File PDF_SCRATCH_FILE = new File("pdfScratchFile.tmp"); private PDFTokenizer() { } /** * Splits text from PDF URL to words and returns them as TokensMap object. * @param url Input PDF URL * @return Map that contains tokens, which are belong to PDF document * @throws IOException If an I/O error occurs */ public static TokensMap tokenizePdf(URL url) throws IOException { checkTempFileExistance(); try (RandomAccessFile scratchFile = new RandomAccessFile(PDF_SCRATCH_FILE, "rw")) { try (PDDocument pd = PDDocument.load(url, scratchFile)) { return tokenizeInput(pd); } } } /** * Checks for TEMP_PDF_FILE existence and deletes it, if true * @throws IOException If an I/O error occurs */ private static void checkTempFileExistance() throws IOException { if (PDF_SCRATCH_FILE.exists()) { PDF_SCRATCH_FILE.delete(); } } /** * Reads all pages of the PDF file and splits text to words, which are * returned as TokensMap object. 
* @param pd PDDocument object that is created from stream * @return Map that contains tokens, which are belong to PDF document * @throws IOException If an I/O error occurs */ private static TokensMap tokenizeInput(PDDocument pd) throws IOException { int numberOfPages = pd.getNumberOfPages(); if (pd.isEncrypted()) { log.log(Level.SEVERE, "PDF is ecrypted"); return null; } else if (numberOfPages < 1) { log.log(Level.SEVERE, "PDF number of pages is less than 1"); return null; } PDFTextStripper stripper = new PDFTextStripper(); stripper.setStartPage(1); stripper.setEndPage(numberOfPages); StringTokenizer st = new StringTokenizer(stripper.getText(pd)); TokensMap tokensMap = new TokensMap(); TokenizerUtils.addTokensToMap(tokensMap, st); return tokensMap; } }