com.cloudera.recordbreaker.learnstructure.test.InferenceTest.java Source code

Java tutorial

Introduction

Here is the source code for com.cloudera.recordbreaker.learnstructure.test.InferenceTest.java

Source

/*
 * Copyright (c) 2011, Cloudera, Inc. All Rights Reserved.
 *
 * Cloudera, Inc. licenses this file to you under the Apache License,
 * Version 2.0 (the "License"). You may not use this file except in
 * compliance with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * This software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 * CONDITIONS OF ANY KIND, either express or implied. See the License for
 * the specific language governing permissions and limitations under the
 * License.
 */
package com.cloudera.recordbreaker.learnstructure.test;

import java.io.File;
import java.io.IOException;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.conf.Configuration;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericDatumReader;

import org.junit.rules.TemporaryFolder;
import com.cloudera.recordbreaker.learnstructure.LearnStructure;

/**
 * InferenceTest holds the shared driver for testing the LearnStructure component's
 * structure-inference code.
 *
 * The class is abstract and contributes a single helper,
 * {@link #runSingletonTest(File, File)}, which runs inference over one text file and
 * checks what fraction of the input lines came back out as Avro records.
 *
 * @author "Michael Cafarella" <mjc@cloudera.com>
 * @version 1.0
 * @since 1.0
 */
public abstract class InferenceTest {
    /** A run succeeds only when strictly more than this fraction of input lines is parsed. */
    private static final double MIN_PARSE_RATIO = 0.85;

    /**
     * The "textdata" directory under the location named by the
     * <code>test.samples.dir</code> system property (default "src/samples").
     */
    static File sampleDir = new File(System.getProperty("test.samples.dir", "src/samples"), "textdata");

    /**
     * runSingletonTest() executes LearnStructure test for a single given input text file.
     *
     * A scratch directory named "testinference-" plus the input file's name is created
     * under <code>workingDir</code>, receives the schema, parser, JSON and Avro outputs,
     * and is removed again (with its contents) before the method returns.
     *
     * @param workingDir a <code>File</code> value; parent directory for the scratch directory
     * @param inputData a <code>File</code> value; the text file to infer structure from
     * @return a <code>boolean</code> value;  did the test succeed?  True only when the
     *         ratio of Avro records to input lines exceeds MIN_PARSE_RATIO; false
     *         otherwise, and false when an IOException occurs.
     */
    boolean runSingletonTest(File workingDir, File inputData) {
        File tmpSingletonDir = new File(workingDir, "testinference-" + inputData.getName());
        try {
            FileSystem localFS = FileSystem.getLocal(new Configuration());
            // Fail early (via the IOException handler below) if there is nowhere to write.
            if (!tmpSingletonDir.mkdirs() && !tmpSingletonDir.isDirectory()) {
                throw new IOException("Could not create working directory " + tmpSingletonDir);
            }
            String tmpDirPath = tmpSingletonDir.getCanonicalPath();
            Path schemaFile = new Path(tmpDirPath, LearnStructure.SCHEMA_FILENAME);
            Path parseTreeFile = new Path(tmpDirPath, LearnStructure.PARSER_FILENAME);
            Path jsonDataFile = new Path(tmpDirPath, LearnStructure.JSONDATA_FILENAME);
            Path avroFile = new Path(tmpDirPath, LearnStructure.DATA_FILENAME);

            LearnStructure ls = new LearnStructure();
            // Check to see how many records exist in the original input
            int lineCount = countLines(inputData);

            // Infer structure
            ls.inferRecordFormat(localFS, new Path(inputData.getCanonicalPath()), localFS, schemaFile,
                    parseTreeFile, jsonDataFile, avroFile, false, lineCount);

            // Test the inferred structure
            // First, load in the avro file and see how many records there are.
            int avroCount = countAvroRecords(new File(avroFile.toString()));

            // Was the synthesized parser able to figure out the file?
            // (Zero lines and zero records give NaN, which compares false.)
            double parseRatio = avroCount / (1.0 * lineCount);
            return (parseRatio > MIN_PARSE_RATIO);
        } catch (IOException e) {
            try {
                System.err.println("File: " + inputData.getCanonicalPath());
            } catch (IOException ex) {
                ex.printStackTrace();
            }
            e.printStackTrace();
            return false;
        } finally {
            // remove temp files; File.delete() alone cannot remove a non-empty directory
            deleteRecursively(tmpSingletonDir);
        }
    }

    /** Counts the lines of the given text file, as reported by BufferedReader.readLine(). */
    private static int countLines(File textFile) throws IOException {
        int lineCount = 0;
        BufferedReader reader = new BufferedReader(new FileReader(textFile));
        try {
            while (reader.readLine() != null) {
                lineCount++;
            }
        } finally {
            reader.close();
        }
        return lineCount;
    }

    /** Counts the records in the given Avro data file by iterating over all of them. */
    private static int countAvroRecords(File avroData) throws IOException {
        int recordCount = 0;
        DataFileReader<Object> reader =
                new DataFileReader<Object>(avroData, new GenericDatumReader<Object>());
        try {
            Iterator<Object> it = reader.iterator();
            while (it.hasNext()) {
                recordCount++;
                it.next();
            }
        } finally {
            reader.close();
        }
        return recordCount;
    }

    /** Best-effort removal of a file or a directory tree; failures are ignored. */
    private static void deleteRecursively(File target) {
        // listFiles() yields null when target is not a directory (or cannot be listed).
        File[] children = target.listFiles();
        if (children != null) {
            for (File child : children) {
                deleteRecursively(child);
            }
        }
        target.delete();
    }
}