com.cloudera.knittingboar.io.TestInputRecordsSplit.java Source code

Introduction

Here is the source code for com.cloudera.knittingboar.io.TestInputRecordsSplit.java, a JUnit test from the Knittingboar project. It writes a small delimited text file to a local working directory, asks Hadoop's TextInputFormat for input splits, and then verifies that the project's InputRecordsSplit reader returns exactly the records that belong to each split, including after a reset back to the start of a split.
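
For context, the test exercises Hadoop's old org.apache.hadoop.mapred split API: ask an input format for splits, then read each split record by record. Below is a minimal, self-contained sketch of that underlying pattern, using Hadoop's stock TextInputFormat record reader rather than the project's InputRecordsSplit (the class name SplitReadSketch and the args[0] input path are illustrative):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class SplitReadSketch {
    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();
        job.set("fs.defaultFS", "file:///"); // read from the local filesystem, as the test does
        FileInputFormat.setInputPaths(job, new Path(args[0]));

        TextInputFormat format = new TextInputFormat();
        format.configure(job);

        // Request one split; Hadoop may still return more depending on block size.
        InputSplit[] splits = format.getSplits(job, 1);

        LongWritable key = new LongWritable();
        Text value = new Text();
        long total = 0;
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader =
                    format.getRecordReader(split, job, Reporter.NULL);
            try {
                while (reader.next(key, value)) {
                    total++; // next() returns false at the split boundary
                }
            } finally {
                reader.close();
            }
        }
        System.out.println("records read across all splits: " + total);
    }
}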

Source

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.cloudera.knittingboar.io;

import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

import junit.framework.TestCase;

public class TestInputRecordsSplit extends TestCase {

    private static final Log LOG = LogFactory.getLog(TestInputRecordsSplit.class.getName());

    private static final int MAX_LENGTH = 1000; // not referenced by the tests below

    private static JobConf defaultConf = new JobConf();

    private static FileSystem localFs = null;
    static {
        try {
            defaultConf.set("fs.defaultFS", "file:///");
            localFs = FileSystem.getLocal(defaultConf);
        } catch (IOException e) {
            throw new RuntimeException("init failure", e);
        }
    }

    private static Path workDir = new Path(new Path(System.getProperty("test.build.data", "/tmp")),
            "TestInputRecordsSplit").makeQualified(localFs);

    /**
     * Creates an InputRecordsSplit and reads some records through it,
     * making sure we maintain split discipline (no records are read
     * beyond the split boundary).
     *
     * @throws IOException
     */
    public void testReadSplitViaInputRecordsSplit() throws IOException {

        // InputRecordsSplit(JobConf jobConf, InputSplit split) needs a JobConf
        // and a split calculated from that JobConf, so build both here.

        JobConf job = new JobConf(defaultConf);
        Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit.txt");

        int tmp_file_size = 2000;

        long block_size = localFs.getDefaultBlockSize();

        System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

        Writer writer = new OutputStreamWriter(localFs.create(file));
        try {
            for (int i = 0; i < tmp_file_size; i++) {
                writer.write(
                        "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
                writer.write("\n");
            }
        } finally {
            writer.close();
        }

        System.out.println("file write complete, wrote " + tmp_file_size + " recs");

        // A reporter that does nothing (declared for reference; not used below)
        Reporter reporter = Reporter.NULL;

        System.out.println("> setting splits for: " + workDir);

        //    localFs.delete(workDir, true);
        FileInputFormat.setInputPaths(job, file);

        // set up the input format and request a single split
        TextInputFormat format = new TextInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable(); // unused: InputRecordsSplit.next(Text) fills only the value
        Text value = new Text();

        int numSplits = 1;

        InputSplit[] splits = format.getSplits(job, numSplits);

        LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

        System.out.println("---- debug splits --------- ");

        int total_read = 0;

        for (int x = 0; x < splits.length; x++) {

            System.out.println("> Split [" + x + "]: " + splits[x].getLength());

            int count = 0;
            InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
            while (custom_reader.next(value)) {
                count++;
            }

            System.out.println("read: " + count + " records for split " + x);

            total_read += count;

        } // for each split

        System.out.println("--------- total read across all splits: " + total_read);

        assertEquals(tmp_file_size, total_read);

    }

    /*
    public void testRCV1Splits() throws IOException {

        String file_rcv1 = "/Users/jpatterson/Downloads/rcv1/rcv1.train.vw";

        System.out.println("testRCV1Splits >> " + file_rcv1);

        JobConf job = new JobConf(defaultConf);
        Path file = new Path(file_rcv1);

        FileInputFormat.setInputPaths(job, file);

        // set up the input format and request a single split
        TextInputFormat format = new TextInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable();
        Text value = new Text();

        int numSplits = 1;

        InputSplit[] splits = format.getSplits(job, numSplits);

        LOG.info("requested " + numSplits + " splits, got " + splits.length);

        System.out.println("---- debug splits --------- ");

        int total_read = 0;

        for (int x = 0; x < splits.length; x++) {

            System.out.println("> Split [" + x + "]: " + splits[x].toString() + ", len: " + splits[x].getLength());

            int count = 0;
            InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
            while (custom_reader.next(value)) {
                count++;
            }

            System.out.println("read: " + count + " records for split " + x);

            total_read += count;

        } // for each split

        System.out.println("total read across all splits: " + total_read);

    }
    */

    /**
     * Reads a split to completion, resets the reader to the start of the
     * split, and verifies that the second pass sees the same record count.
     *
     * @throws IOException
     */
    public void testReadSplitViaInputRecordsSplit_SplitReset() throws IOException {

        // InputRecordsSplit(JobConf jobConf, InputSplit split) needs a JobConf
        // and a split calculated from that JobConf, so build both here.

        JobConf job = new JobConf(defaultConf);
        Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit_SplitReset");

        int tmp_file_size = 2000;

        long block_size = localFs.getDefaultBlockSize();

        System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

        Writer writer = new OutputStreamWriter(localFs.create(file));
        try {
            for (int i = 0; i < tmp_file_size; i++) {
                writer.write(
                        "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
                writer.write("\n");
            }
        } finally {
            writer.close();
        }

        System.out.println("file write complete, wrote " + tmp_file_size + " recs");

        // A reporter that does nothing (declared for reference; not used below)
        Reporter reporter = Reporter.NULL;

        //    localFs.delete(workDir, true);
        FileInputFormat.setInputPaths(job, file);

        // set up the input format and request a single split
        TextInputFormat format = new TextInputFormat();
        format.configure(job);
        LongWritable key = new LongWritable(); // unused: InputRecordsSplit.next(Text) fills only the value
        Text value = new Text();

        int numSplits = 1;

        InputSplit[] splits = format.getSplits(job, numSplits);

        LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

        System.out.println("---- testReadSplitViaInputRecordsSplit_SplitReset: debug splits --------- ");

        int total_read = 0;

        System.out.println("> Split [0]: " + splits[0].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);
        while (custom_reader.next(value)) {
            count++;
        }

        System.out.println("read: " + count + " records for split " + 0);

        int count_reset = 0;
        custom_reader.ResetToStartOfSplit();
        while (custom_reader.next(value)) {
            count_reset++;
        }

        System.out.println("read: " + count_reset + " records for split " + 0 + " after reset");

        assertEquals(count, count_reset);

    }

}
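
The InputRecordsSplit class under test is part of Knittingboar and is not shown on this page. As a rough mental model only, a reader with the same surface (a next(Text) that honors the split boundary, plus a reset back to the start of the split) could be sketched over Hadoop's LineRecordReader as below; the class name InputRecordsSplitSketch and the method name resetToStartOfSplit are illustrative, not the project's actual implementation:

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;

// Illustrative sketch only -- not the knittingboar implementation.
public class InputRecordsSplitSketch {

    private final JobConf jobConf;
    private final FileSplit split;
    private final LongWritable key = new LongWritable(); // byte offsets; discarded
    private LineRecordReader reader;

    public InputRecordsSplitSketch(JobConf jobConf, FileSplit split) throws IOException {
        this.jobConf = jobConf;
        this.split = split;
        this.reader = new LineRecordReader(jobConf, split);
    }

    // Fills value with the next line; returns false once the split is exhausted.
    public boolean next(Text value) throws IOException {
        return reader.next(key, value);
    }

    // Re-opens the underlying reader at the start of the split,
    // mirroring what the reset test above relies on.
    public void resetToStartOfSplit() throws IOException {
        reader.close();
        reader = new LineRecordReader(jobConf, split);
    }
}

Re-opening the LineRecordReader is the simplest way to honor a reset contract, since the reader is constructed with the split's start and end offsets and seeks back to the start on creation.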