List of usage examples for com.itextpdf.text.pdf.parser PdfContentStreamProcessor reset
public void reset()
From source file: pl.edu.icm.cermine.structure.ITextCharacterExtractor.java
License: Open Source License
/**
 * Extracts text chunks from PDF using iText and stores them in BxDocument object.
 * Depending on parsed PDF, extracted text chunks may or may not be individual glyphs,
 * they correspond to single string operands of PDF's text-showing operators
 * (Tj, TJ, ' and ").
 *
 * <p>When both {@code frontPagesLimit} and {@code backPagesLimit} are positive, pages whose
 * number is greater than {@code frontPagesLimit} and lower than
 * {@code numberOfPages - 1 - backPagesLimit} are skipped entirely.
 *
 * <p>The {@link PdfReader} created for the stream is always closed before this method
 * returns or throws.
 *
 * @param stream PDF's stream
 * @return BxDocument containing pages with extracted chunks stored as BxChunk lists
 * @throws AnalysisException if the PDF is invalid, cannot be read, or yields no pages
 */
@Override
public BxDocument extractCharacters(InputStream stream) throws AnalysisException {
    try {
        BxDocumentCreator documentCreator = new BxDocumentCreator();
        PdfReader reader = new PdfReader(stream);
        try {
            PdfContentStreamProcessor processor = new PdfContentStreamProcessor(documentCreator);
            // The page count does not change while reading, so query it once.
            final int pageCount = reader.getNumberOfPages();

            for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
                // Skip the middle of the document when both page limits are configured.
                // NOTE(review): the upper bound keeps pages from (pageCount - 1 - backPagesLimit)
                // onwards, which looks like two more trailing pages than backPagesLimit alone
                // would suggest - confirm whether this margin is intentional.
                if (frontPagesLimit > 0 && backPagesLimit > 0
                        && pageNumber > frontPagesLimit
                        && pageNumber < pageCount - 1 - backPagesLimit) {
                    continue;
                }

                documentCreator.processNewBxPage(reader.getPageSize(pageNumber));

                PdfDictionary resources = reader.getPageN(pageNumber).getAsDict(PdfName.RESOURCES);
                processAlternativeFontNames(resources);
                processAlternativeColorSpace(resources);

                // Reset the processor before handing it the next page's content stream.
                processor.reset();
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNumber), resources);

                TimeoutRegister.get().check();
            }

            BxDocument doc = filterComponents(removeDuplicateChunks(documentCreator.document));
            if (doc.getFirstChild() == null) {
                throw new AnalysisException("Document contains no pages");
            }
            return doc;
        } finally {
            // Release the reader on success and on every failure path.
            reader.close();
        }
    } catch (InvalidPdfException ex) {
        throw new AnalysisException("Invalid PDF file", ex);
    } catch (IOException ex) {
        throw new AnalysisException("Cannot extract characters from PDF file", ex);
    }
}