// Java tutorial
/*
 * (C) Copyright 2011 Nuxeo SAS (http://nuxeo.com/) and contributors.
 *
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the GNU Lesser General Public License
 * (LGPL) version 2.1 which accompanies this distribution, and is available at
 * http://www.gnu.org/licenses/lgpl.html
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * Contributors:
 *     Nuxeo - initial API and implementation
 */
package org.nuxeo.ecm.platform.ocr.service.impl;

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.nuxeo.ecm.core.api.Blob;
import org.nuxeo.ecm.core.api.blobholder.BlobHolder;
import org.nuxeo.ecm.core.api.blobholder.SimpleBlobHolder;
import org.nuxeo.ecm.core.convert.api.ConversionException;
import org.nuxeo.ecm.core.convert.api.ConversionService;
import org.nuxeo.ecm.core.convert.api.ConverterCheckResult;
import org.nuxeo.ecm.platform.ocr.service.DocumentStructure;
import org.nuxeo.ecm.platform.ocr.service.ImageRegion;
import org.nuxeo.ecm.platform.ocr.service.MissingCommandLineToolException;
import org.nuxeo.ecm.platform.ocr.service.OcrException;
import org.nuxeo.ecm.platform.ocr.service.OcrService;
import org.nuxeo.ecm.platform.ocr.service.TextRegion;
import org.nuxeo.runtime.api.Framework;
import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Implementation of the OcrService based on the Olena command line interface.
 */
public class OcrServiceImpl implements OcrService {

    /** Name of the converter looked up on the ConversionService. */
    public static final String OLENA_CONVERTER_NAME = "olena_content_in_doc";

    /**
     * Character class of punctuation, currency and symbol characters that
     * {@link #parseXml(InputStream)} strips from a paragraph to estimate how
     * much of it is actual text. As written, the class also contains the
     * space character.
     */
    public static final Pattern TESSERACT_GIBBERISH = Pattern.compile(
            "[\\\\,'\u2019\u2018\"\u201C \u201D`&\u00A7@~.:;!?_%"
                    + "\u00A5\u00A3\u20AC\u00A2\u00B0*=<>\u00BB\\-]");

    private static final Log log = LogFactory.getLog(OcrServiceImpl.class);

    /**
     * Helper method to check the availability of the Olena command line tool
     * and help build meaningful error messages.
     *
     * @param service the conversion service to query; when {@code null} it is
     *            looked up through {@link Framework}
     * @param refresh forwarded as-is to
     *            {@code ConversionService.isConverterAvailable}
     * @return the check result for {@link #OLENA_CONVERTER_NAME}
     * @throws OcrException if the conversion service cannot be obtained or
     *             the availability check itself fails
     */
    protected ConverterCheckResult getAvailability(ConversionService service,
            boolean refresh) throws OcrException {
        if (service == null) {
            try {
                service = Framework.getService(ConversionService.class);
            } catch (Exception e) {
                throw new OcrException(
                        "could not access the ConversionService", e);
            }
            if (service == null) {
                throw new OcrException(
                        "could not access the ConversionService");
            }
        }
        try {
            return service.isConverterAvailable(OLENA_CONVERTER_NAME, refresh);
        } catch (ConversionException e) {
            throw new OcrException("failed to check availability of "
                    + OLENA_CONVERTER_NAME, e);
        }
    }

    /**
     * Tells whether the Olena converter is reported as available.
     *
     * @return {@code true} when the availability check succeeds and reports
     *         the converter as available; {@code false} otherwise, including
     *         when the check itself fails (the failure is logged as a
     *         warning)
     */
    @Override
    public boolean isEnabled() {
        try {
            return getAvailability(null, true).isAvailable();
        } catch (OcrException e) {
            log.warn("unexpected exception while checking availability for: "
                    + OLENA_CONVERTER_NAME, e);
            return false;
        }
    }

    /**
     * Extracts the document structure of the image and flattens it into a
     * list of strings: first the paragraphs of every text region, then the
     * embedded text of every image region that has one.
     *
     * @param imageBlob the image to analyze
     * @return the aggregated text fragments, in the order described above
     * @throws OcrException if the structure extraction fails
     */
    @Override
    public List<String> extractText(Blob imageBlob) throws OcrException {
        DocumentStructure structure = extractDocumentStructure(imageBlob);
        List<String> aggregateText = new ArrayList<String>();
        for (TextRegion textRegion : structure.getTextRegions()) {
            aggregateText.addAll(textRegion.paragraphs);
        }
        for (ImageRegion imageRegion : structure.getImageRegions()) {
            if (imageRegion.embeddedText != null) {
                aggregateText.add(imageRegion.embeddedText);
            }
        }
        return aggregateText;
    }

    /**
     * Runs the Olena converter on the image and parses its XML output into a
     * {@link DocumentStructure}.
     *
     * @param imageBlob the image to analyze
     * @return the text and image regions found in the converter output
     * @throws MissingCommandLineToolException if the converter is reported as
     *             unavailable
     * @throws OcrException if the conversion service cannot be accessed, the
     *             converter yields no XML, or the conversion or parsing fails
     */
    @Override
    public DocumentStructure extractDocumentStructure(Blob imageBlob)
            throws OcrException {
        ConversionService conversionService;
        try {
            conversionService = Framework.getService(ConversionService.class);
        } catch (Exception e) {
            throw new OcrException("could not access the ConversionService", e);
        }
        if (conversionService == null) {
            // fail with an explicit message instead of relying on a second
            // lookup inside getAvailability and a possible NPE further down
            throw new OcrException("could not access the ConversionService");
        }
        ConverterCheckResult availability = getAvailability(conversionService,
                false);
        if (!availability.isAvailable()) {
            throw new MissingCommandLineToolException(
                    availability.getErrorMessage() + " "
                            + availability.getInstallationMessage());
        }
        try {
            BlobHolder xmlBlobHolder = conversionService.convert(
                    OLENA_CONVERTER_NAME, new SimpleBlobHolder(imageBlob),
                    new HashMap<String, Serializable>());
            Blob xmlBlob = xmlBlobHolder.getBlob();
            if (xmlBlob == null || xmlBlob.getLength() == 0) {
                throw new OcrException("Unexpected empty XML output for "
                        + OLENA_CONVERTER_NAME);
            }
            if (log.isDebugEnabled()) {
                // only build the full XML string when it will be logged
                log.debug(xmlBlob.getString());
            }
            InputStream xmlStream = xmlBlob.getStream();
            try {
                return parseXml(xmlStream);
            } finally {
                closeQuietly(xmlStream);
            }
        } catch (OcrException e) {
            // already meaningful for the caller: do not wrap it a second time
            throw e;
        } catch (Exception e) {
            throw new OcrException(e.getMessage(), e);
        }
    }

    /**
     * Parses the XML produced by the converter into text and image regions.
     * <p>
     * Every {@code TextRegion} element with a non-degenerate bounding box
     * contributes at most one paragraph, assembled from its {@code Line/@text}
     * attributes; paragraphs that are empty or mostly made of
     * {@link #TESSERACT_GIBBERISH} characters are dropped together with their
     * region. Every {@code ImageRegion} element with a non-degenerate
     * bounding box contributes one image region.
     * <p>
     * The stream is not closed by this method itself.
     *
     * @param xmlStream the XML document to parse
     * @return the structure holding the collected regions
     * @throws OcrException if the document cannot be parsed, an XPath
     *             expression fails, or a coordinate is not a valid integer
     */
    protected DocumentStructure parseXml(InputStream xmlStream)
            throws OcrException {
        List<TextRegion> textRegions = new ArrayList<TextRegion>();
        List<ImageRegion> imageRegions = new ArrayList<ImageRegion>();
        try {
            // NOTE(review): the parser runs with default settings (DTDs and
            // external entities are not disabled); confirm the converter
            // output can be trusted or harden the factory.
            DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            Document document = builder.parse(xmlStream);
            XPath xpath = XPathFactory.newInstance().newXPath();

            // iterate over all the text regions to extract the text content
            // and the global position of the region
            NodeList textRegionNodes = (NodeList) xpath.evaluate(
                    "//TextRegion", document, XPathConstants.NODESET);
            for (int i = 0; i < textRegionNodes.getLength(); i++) {
                Node textRegionNode = textRegionNodes.item(i);
                int[] xRange = coordinateRange(xpath, textRegionNode,
                        "Coords/Point/@x");
                int[] yRange = coordinateRange(xpath, textRegionNode,
                        "Coords/Point/@y");
                int topLeftX = xRange[0];
                int bottomRightX = xRange[1];
                int topLeftY = yRange[0];
                int bottomRightY = yRange[1];
                if (topLeftX == -1 || topLeftY == -1 || bottomRightX == -1
                        || bottomRightY == -1 || topLeftX == bottomRightX
                        || topLeftY == bottomRightY) {
                    // no coordinates or zero-width / zero-height box
                    continue;
                }
                TextRegion textRegion = new TextRegion(topLeftX, topLeftY,
                        bottomRightX, bottomRightY);
                String paragraph = joinLines(xpath, textRegionNode);
                // ignore empty paragraphs
                if (!paragraph.isEmpty() && isMostlyText(paragraph)) {
                    textRegion.paragraphs.add(paragraph);
                }
                if (!textRegion.paragraphs.isEmpty()) {
                    textRegions.add(textRegion);
                }
            }

            // iterate over all the image regions to extract their global
            // position along with any embedded text
            NodeList imageRegionNodes = (NodeList) xpath.evaluate(
                    "//ImageRegion", document, XPathConstants.NODESET);
            for (int i = 0; i < imageRegionNodes.getLength(); i++) {
                Node imageRegionNode = imageRegionNodes.item(i);
                int[] xRange = coordinateRange(xpath, imageRegionNode,
                        "Coords/Point/@x");
                int[] yRange = coordinateRange(xpath, imageRegionNode,
                        "Coords/Point/@y");
                int topLeftX = xRange[0];
                int bottomRightX = xRange[1];
                int topLeftY = yRange[0];
                int bottomRightY = yRange[1];
                if (topLeftX > -1 && topLeftY > -1 && bottomRightX > -1
                        && bottomRightY > -1 && topLeftX < bottomRightX
                        && topLeftY < bottomRightY) {
                    ImageRegion imageRegion = new ImageRegion(topLeftX,
                            topLeftY, bottomRightX, bottomRightY);
                    // TODO: extract embedded text if any
                    imageRegions.add(imageRegion);
                }
            }
            return new DocumentStructureImpl(textRegions, imageRegions);
        } catch (ParserConfigurationException e) {
            throw new OcrException(e);
        } catch (SAXException e) {
            throw new OcrException(e);
        } catch (IOException e) {
            throw new OcrException(e);
        } catch (XPathExpressionException e) {
            throw new OcrException(e);
        } catch (NumberFormatException e) {
            // malformed coordinate attribute in the converter output
            throw new OcrException(e);
        }
    }

    /**
     * Computes the smallest and largest integer value among the attributes
     * selected by {@code expression} relative to {@code regionNode}.
     *
     * @return a two-element array {@code {min, max}}; both are {@code -1}
     *         when the expression selects nothing ({@code -1} is the "not
     *         set" sentinel the callers test for)
     * @throws NumberFormatException if an attribute value is not an integer
     */
    private static int[] coordinateRange(XPath xpath, Node regionNode,
            String expression) throws XPathExpressionException {
        NodeList attrs = (NodeList) xpath.evaluate(expression, regionNode,
                XPathConstants.NODESET);
        int min = -1;
        int max = -1;
        for (int i = 0; i < attrs.getLength(); i++) {
            int value = Integer.parseInt(((Attr) attrs.item(i)).getValue());
            if (min == -1 || min > value) {
                min = value;
            }
            if (max == -1 || max < value) {
                max = value;
            }
        }
        return new int[] { min, max };
    }

    /**
     * Concatenates the {@code Line/@text} attributes of a text region into a
     * single trimmed paragraph. Lines are separated by a space, except that a
     * line ending with a hyphen is glued to the next one with the hyphen
     * removed.
     */
    private static String joinLines(XPath xpath, Node textRegionNode)
            throws XPathExpressionException {
        NodeList textNodes = (NodeList) xpath.evaluate("Line/@text",
                textRegionNode, XPathConstants.NODESET);
        StringBuilder sb = new StringBuilder();
        for (int k = 0; k < textNodes.getLength(); k++) {
            String line = ((Attr) textNodes.item(k)).getValue();
            if (line.endsWith("-") || line.endsWith("\u2010")
                    || line.endsWith("\u2011")) {
                // special handling for hyphens
                sb.append(line.substring(0, line.length() - 1));
            } else {
                sb.append(line);
                sb.append(" ");
            }
        }
        return sb.toString().trim();
    }

    /**
     * Tells whether more than half of the paragraph remains once the
     * {@link #TESSERACT_GIBBERISH} characters are removed, in which case the
     * paragraph is most likely NOT an artifact of the OCR.
     */
    private static boolean isMostlyText(String paragraph) {
        Matcher matcher = TESSERACT_GIBBERISH.matcher(paragraph);
        String cleaned = matcher.replaceAll("");
        return cleaned.length() > 0.5 * paragraph.length();
    }

    /**
     * Closes the stream if it is not {@code null}; a failure to close is only
     * logged so that it cannot mask the outcome of the parsing.
     */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            log.debug("failed to close the XML output stream of "
                    + OLENA_CONVERTER_NAME, e);
        }
    }
}