Example usage for com.itextpdf.text.pdf.parser PdfContentStreamProcessor reset

List of usage examples for com.itextpdf.text.pdf.parser PdfContentStreamProcessor reset

Introduction

In this page you can find the example usage for com.itextpdf.text.pdf.parser PdfContentStreamProcessor reset.

Prototype

public void reset() 

Source Link

Document

Resets the graphics state stack, matrices and resources.

Usage

From source file:pl.edu.icm.cermine.structure.ITextCharacterExtractor.java

License:Open Source License

/**
 * Extracts text chunks from PDF using iText and stores them in BxDocument object.
 * Depending on parsed PDF, extracted text chunks may or may not be individual glyphs,
 * they correspond to single string operands of PDF's text-showing operators
 * (Tj, TJ, ' and ").//  ww  w  . j  a v a2s  . co  m
 * @param stream PDF's stream
 * @return BxDocument containing pages with extracted chunks stored as BxChunk lists
 * @throws AnalysisException AnalysisException
 */
@Override
public BxDocument extractCharacters(InputStream stream) throws AnalysisException {
    try {
        BxDocumentCreator documentCreator = new BxDocumentCreator();

        PdfReader reader = new PdfReader(stream);
        PdfContentStreamProcessor processor = new PdfContentStreamProcessor(documentCreator);

        for (int pageNumber = 1; pageNumber <= reader.getNumberOfPages(); pageNumber++) {
            if (frontPagesLimit > 0 && backPagesLimit > 0 && pageNumber > frontPagesLimit
                    && pageNumber < reader.getNumberOfPages() - 1 - backPagesLimit) {
                continue;
            }
            documentCreator.processNewBxPage(reader.getPageSize(pageNumber));

            PdfDictionary resources = reader.getPageN(pageNumber).getAsDict(PdfName.RESOURCES);
            processAlternativeFontNames(resources);
            processAlternativeColorSpace(resources);

            processor.reset();
            processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNumber), resources);
            TimeoutRegister.get().check();
        }

        BxDocument doc = filterComponents(removeDuplicateChunks(documentCreator.document));
        if (doc.getFirstChild() == null) {
            throw new AnalysisException("Document contains no pages");
        }
        return doc;
    } catch (InvalidPdfException ex) {
        throw new AnalysisException("Invalid PDF file", ex);
    } catch (IOException ex) {
        throw new AnalysisException("Cannot extract characters from PDF file", ex);
    }
}