Example usage for com.lowagie.text.pdf PdfContentByte TEXT_RENDER_MODE_INVISIBLE

List of usage examples for com.lowagie.text.pdf PdfContentByte TEXT_RENDER_MODE_INVISIBLE

Introduction

In this page you can find the example usage for com.lowagie.text.pdf PdfContentByte TEXT_RENDER_MODE_INVISIBLE.

Prototype

int TEXT_RENDER_MODE_INVISIBLE

To view the source code for com.lowagie.text.pdf PdfContentByte TEXT_RENDER_MODE_INVISIBLE, click the Source Link.

Document

A possible text rendering value

Usage

From source file: mx.randalf.digital.ocr.hocrtopdf.HocrToPdf.java

License:Open Source License

/**
 * Builds a single-page, text-searchable PDF from a scanned page image and its hOCR file.
 *
 * <p>The page is sized from the image's pixel dimensions and DPI. The image is added to
 * the writer's direct content, and the text of every {@code ocr_line} element whose
 * {@code title} carries a {@code bbox} is written to the content layer underneath it in
 * invisible rendering mode, anchored at the lower-left corner of that box.
 *
 * @param fImg  the scanned page image
 * @param fHtml the hOCR (HTML) file describing the recognised text
 * @param fPdf  the PDF file to write
 * @throws IOException       if one of the files cannot be read or written
 * @throws DocumentException if the PDF document cannot be built
 * @throws Exception         if the hOCR file has no {@code ocr_page} tag, or that tag's
 *                           {@code title} has no {@code image} property
 */
public void hocrToPdf(File fImg, File fHtml, File fPdf) throws IOException, DocumentException, Exception {
    // The resolution of a PDF file (using iText) is 72pt per inch
    final float pointsPerInch = 72.0f;

    // File.toURI() escapes spaces and other special characters and copes with
    // platform-specific path syntax, unlike prepending "file://" to the path.
    URL inputHOCRFile = fHtml.toURI().toURL();

    Document pdfDocument = null;
    FileOutputStream outputPDFStream = new FileOutputStream(fPdf);
    try {
        // Using the jericho library to parse the HTML file
        Source source = new Source(inputHOCRFile);

        // Find the tag of class ocr_page, which is expected to name the scanned image
        StartTag pageTag = source.findNextStartTag(0, "class", "ocr_page", false);
        if (pageTag == null) {
            throw new Exception("Could not find a tag of class \"ocr_page\", aborting.");
        }
        String pageTitle = pageTag.getElement().getAttributeValue("title");
        Pattern imagePattern = Pattern.compile("image\\s+([^;]+)");
        if (pageTitle == null || !imagePattern.matcher(pageTitle).find()) {
            throw new Exception(
                    "Could not find the \"image\" property in the title of the \"ocr_page\" tag, aborting.");
        }

        // Load the image
        Image pageImage = Image.getInstance(fImg.toURI().toURL());

        // A DPI of 0 (no usable resolution information in the image) would make the
        // divisions below produce an infinite page size; assume 72 dpi, i.e. one
        // pixel per point, in that case.
        float dotsPerPointX = pageImage.getDpiX() > 0 ? pageImage.getDpiX() / pointsPerInch : 1.0f;
        float dotsPerPointY = pageImage.getDpiY() > 0 ? pageImage.getDpiY() / pointsPerInch : 1.0f;
        float pageImagePixelHeight = pageImage.getHeight();
        float pageWidthPt = pageImage.getWidth() / dotsPerPointX;
        float pageHeightPt = pageImage.getHeight() / dotsPerPointY;

        pdfDocument = new Document(new Rectangle(pageWidthPt, pageHeightPt));
        PdfWriter pdfWriter = PdfWriter.getInstance(pdfDocument, outputPDFStream);
        pdfDocument.open();

        // first define a standard font for our text
        Font defaultFont = FontFactory.getFont(FontFactory.HELVETICA, 8, Font.BOLD, CMYKColor.BLACK);

        // Put the text behind the picture (use getDirectContent() instead for debugging)
        PdfContentByte cb = pdfWriter.getDirectContentUnder();

        // Put the image in front of the text (use getDirectContentUnder() instead for debugging)
        pageImage.scaleToFit(pageWidthPt, pageHeightPt);
        pageImage.setAbsolutePosition(0, 0);
        pdfWriter.getDirectContent().addImage(pageImage);

        // The bbox property of a line's title holds four pixel coordinates; they are
        // used below as left, top, right, bottom, with the origin at the top-left.
        Pattern bboxPattern = Pattern.compile("bbox\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)");

        // Only tags of the ocr_line class are interesting
        StartTag ocrLineTag = source.findNextStartTag(0, "class", "ocr_line", false);
        while (ocrLineTag != null) {
            au.id.jericho.lib.html.Element lineElement = ocrLineTag.getElement();
            String lineTitle = lineElement.getAttributeValue("title");
            Matcher bboxMatcher = (lineTitle == null) ? null : bboxPattern.matcher(lineTitle);
            if (bboxMatcher != null && bboxMatcher.find()) {
                // We found a tag of the ocr_line class containing a bbox property
                int left = Integer.parseInt(bboxMatcher.group(1));
                int top = Integer.parseInt(bboxMatcher.group(2));
                int bottom = Integer.parseInt(bboxMatcher.group(4));
                String line = lineElement.getContent().extractText();
                float bboxHeightPt = (bottom - top) / dotsPerPointY;

                // Put the text into the PDF
                cb.beginText();
                // Comment the next line to debug the PDF output (visible text)
                cb.setTextRenderingMode(PdfContentByte.TEXT_RENDER_MODE_INVISIBLE);
                // TODO: Scale the text width to fit the OCR bbox
                cb.setFontAndSize(defaultFont.getBaseFont(), Math.round(bboxHeightPt));
                // hOCR measures y downwards from the top of the image, PDF upwards from
                // the bottom of the page, so the bottom edge of the box is flipped.
                cb.moveText(left / dotsPerPointX, (pageImagePixelHeight - bottom) / dotsPerPointY);
                cb.showText(line);
                cb.endText();
            }
            ocrLineTag = source.findNextStartTag(ocrLineTag.getEnd(), "class", "ocr_line", false);
        }
    } finally {
        // Always release the file handle, even when closing the document itself fails.
        try {
            if (pdfDocument != null) {
                pdfDocument.close();
            }
        } finally {
            outputPDFStream.close();
        }
    }
}