org.apache.tika.parser.ocr.TesseractOCRConfigTest.java Source code

Java tutorial

Introduction

Here is the source code for org.apache.tika.parser.ocr.TesseractOCRConfigTest.java, a JUnit test class for TesseractOCRConfig.

Source

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.parser.ocr;

import org.apache.commons.lang.SystemUtils;
import org.apache.tika.TikaTest;
import org.junit.Test;

import java.io.File;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

/**
 * Unit tests for {@link TesseractOCRConfig}: the values reported by a freshly
 * constructed config, the values reported after loading properties resources,
 * and the argument validation performed by the setters.
 */
public class TesseractOCRConfigTest extends TikaTest {

    /**
     * Builds a config from a properties resource on the test classpath.
     * <p>
     * A missing resource is reported explicitly instead of surfacing later as
     * a confusing "wrong value" assertion, and the stream is always closed by
     * the test itself so the test does not rely on the constructor doing it.
     *
     * @param resource absolute classpath location of the properties file
     * @return the config constructed from that resource
     * @throws Exception if closing the stream fails
     */
    private static TesseractOCRConfig configFromResource(String resource) throws Exception {
        InputStream stream = TesseractOCRConfigTest.class.getResourceAsStream(resource);
        assertTrue("Missing test resource: " + resource, stream != null);
        try {
            return new TesseractOCRConfig(stream);
        } finally {
            stream.close();
        }
    }

    /** Checks every getter of a config created with the no-arg constructor. */
    @Test
    public void testNoConfig() throws Exception {
        TesseractOCRConfig config = new TesseractOCRConfig();
        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
        assertEquals("Invalid default language value", "eng", config.getLanguage());
        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
        assertEquals("Invalid default minFileSizeToOcr value", 0, config.getMinFileSizeToOcr());
        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
        assertEquals("Invalid default timeout value", 120, config.getTimeout());
        assertEquals("Invalid default ImageMagickPath value", "", config.getImageMagickPath());
        assertEquals("Invalid default density value", 300, config.getDensity());
        assertEquals("Invalid default depth value", 4, config.getDepth());
        assertEquals("Invalid default colorspace value", "gray", config.getColorspace());
        assertEquals("Invalid default filter value", "triangle", config.getFilter());
        assertEquals("Invalid default resize value", 900, config.getResize());
    }

    /**
     * Loads the "partial" properties resource and checks that the overridden
     * settings take the expected values while the others keep the values
     * asserted for a default config.
     */
    @Test
    public void testPartialConfig() throws Exception {
        TesseractOCRConfig config =
                configFromResource("/test-properties/TesseractOCRConfig-partial.properties");

        assertEquals("Invalid default tesseractPath value", "", config.getTesseractPath());
        assertEquals("Invalid default tessdataPath value", "", config.getTessdataPath());
        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
        assertEquals("Invalid default pageSegMode value", "1", config.getPageSegMode());
        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
        assertEquals("Invalid default maxFileSizeToOcr value", Integer.MAX_VALUE, config.getMaxFileSizeToOcr());
        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
        assertEquals("Invalid default ImageMagickPath value", "", config.getImageMagickPath());
        assertEquals("Invalid overridden density value", 200, config.getDensity());
        assertEquals("Invalid overridden depth value", 8, config.getDepth());
        assertEquals("Invalid overridden filter value", "box", config.getFilter());
        assertEquals("Invalid overridden resize value", 300, config.getResize());
    }

    /**
     * Loads the "full" properties resource and checks every overridden value.
     * The path assertions only run on Unix-like systems.
     */
    @Test
    public void testFullConfig() throws Exception {
        TesseractOCRConfig config =
                configFromResource("/test-properties/TesseractOCRConfig-full.properties");

        if (SystemUtils.IS_OS_UNIX) {
            assertEquals("Invalid overridden tesseractPath value", "/opt/tesseract" + File.separator,
                    config.getTesseractPath());
            // Message previously said "tesseractPath" although tessdataPath is checked here.
            assertEquals("Invalid overridden tessdataPath value", "/usr/local/share" + File.separator,
                    config.getTessdataPath());
            assertEquals("Invalid overridden ImageMagickPath value", "/usr/local/bin/",
                    config.getImageMagickPath());
        }
        assertEquals("Invalid overridden language value", "fra+deu", config.getLanguage());
        assertEquals("Invalid overridden pageSegMode value", "2", config.getPageSegMode());
        assertEquals("Invalid overridden minFileSizeToOcr value", 1, config.getMinFileSizeToOcr());
        assertEquals("Invalid overridden maxFileSizeToOcr value", 2000000, config.getMaxFileSizeToOcr());
        assertEquals("Invalid overridden timeout value", 240, config.getTimeout());
        assertEquals("Invalid overridden density value", 200, config.getDensity());
        assertEquals("Invalid overridden depth value", 8, config.getDepth());
        assertEquals("Invalid overridden filter value", "box", config.getFilter());
        assertEquals("Invalid overridden resize value", 300, config.getResize());
    }

    /** Each listed language string must be accepted and returned unchanged by the getter. */
    @Test
    public void testValidateValidLanguage() {
        List<String> validLanguages = Arrays.asList("eng", "slk_frak", "chi_tra", "eng+fra",
                "tgk+chi_tra+slk_frak");

        TesseractOCRConfig config = new TesseractOCRConfig();

        for (String language : validLanguages) {
            config.setLanguage(language);
            assertEquals("Valid language not set", language, config.getLanguage());
        }
    }

    /** Each listed language string must be rejected with an {@link IllegalArgumentException}. */
    @Test
    public void testValidateInvalidLanguage() {
        List<String> invalidLanguages = Arrays.asList("", "+", "en", "en+", "eng+fra+", "rm -rf *");

        TesseractOCRConfig config = new TesseractOCRConfig();

        for (String language : invalidLanguages) {
            try {
                config.setLanguage(language);
                fail("Invalid language set: " + language);
            } catch (IllegalArgumentException e) {
                // expected exception thrown
            }
        }
    }

    /**
     * Page segmentation modes "0" and "10" must be accepted; "14" must be
     * rejected with an {@link IllegalArgumentException}.
     */
    @Test
    public void testValidatePageSegMode() {
        TesseractOCRConfig config = new TesseractOCRConfig();

        // Valid values are set outside the try block: an exception here is a
        // genuine failure and must not be mistaken for the expected rejection.
        config.setPageSegMode("0");
        config.setPageSegMode("10");

        try {
            config.setPageSegMode("14");
            fail("Invalid pageSegMode set: 14");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /**
     * Densities 300 and 400 must be accepted; 1 must be rejected with an
     * {@link IllegalArgumentException}.
     */
    @Test
    public void testValidateDensity() {
        TesseractOCRConfig config = new TesseractOCRConfig();

        // Valid values: any exception here fails the test.
        config.setDensity(300);
        config.setDensity(400);

        try {
            config.setDensity(1);
            fail("Invalid density set: 1");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /**
     * Depths 4 and 8 must be accepted; 6 must be rejected with an
     * {@link IllegalArgumentException}.
     */
    @Test
    public void testValidateDepth() {
        TesseractOCRConfig config = new TesseractOCRConfig();

        // Valid values: any exception here fails the test.
        config.setDepth(4);
        config.setDepth(8);

        try {
            config.setDepth(6);
            fail("Invalid depth set: 6");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /**
     * Filters "Triangle" and "box" must be accepted; "abc" must be rejected
     * with an {@link IllegalArgumentException}.
     */
    @Test
    public void testValidateFilter() {
        TesseractOCRConfig config = new TesseractOCRConfig();

        // Valid values: any exception here fails the test.
        config.setFilter("Triangle");
        config.setFilter("box");

        try {
            config.setFilter("abc");
            fail("Invalid filter set: abc");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

    /**
     * Resize values 200 and 400 must be accepted; 1000 must be rejected with
     * an {@link IllegalArgumentException}.
     */
    @Test
    public void testValidateResize() {
        TesseractOCRConfig config = new TesseractOCRConfig();

        // Valid values: any exception here fails the test.
        config.setResize(200);
        config.setResize(400);

        try {
            config.setResize(1000);
            fail("Invalid resize set: 1000");
        } catch (IllegalArgumentException e) {
            // expected exception thrown
        }
    }

}