Example usage for org.apache.hadoop.mapred TextInputFormat configure

List of usage examples for org.apache.hadoop.mapred TextInputFormat configure

Introduction

On this page you can find example usage for org.apache.hadoop.mapred TextInputFormat configure.

Prototype

public void configure(JobConf conf) 
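
Before the usage examples below, here is a minimal, self-contained sketch (not taken from the source files listed under Usage) of the typical call sequence around configure(JobConf): create the format, configure it with the job, then request splits and a record reader. The input path /tmp/input, the class name TextInputFormatConfigureSketch, and the split hint of 1 are placeholders chosen for illustration.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class TextInputFormatConfigureSketch {

    public static void main(String[] args) throws IOException {
        JobConf job = new JobConf();

        // Placeholder input path; point this at a real file or directory.
        FileInputFormat.setInputPaths(job, new Path("/tmp/input"));

        TextInputFormat format = new TextInputFormat();
        // configure(JobConf) lets the format pick up job settings (for example,
        // compression codecs) before splits are computed or records are read.
        format.configure(job);

        InputSplit[] splits = format.getSplits(job, 1);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = format.getRecordReader(split, job, Reporter.NULL);
            LongWritable key = reader.createKey();
            Text value = reader.createValue();
            try {
                while (reader.next(key, value)) {
                    // key is the byte offset of the line; value is the line text
                    System.out.println(key + "\t" + value);
                }
            } finally {
                reader.close();
            }
        }
    }
}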

Usage

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

/**
 * create an InputRecordsSplit and then read some records
 * - make sure we maintain split discipline
 * @throws IOException 
 * 
 */
public void testReadSplitViaInputRecordsSplit() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)

    // needs to get a jobConf from somewhere, under the hood

    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit.txt");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    System.out.println("> setting splits for: " + workDir);

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    //InputSplit test_split = null;

    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            count++;
            //

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    System.out.println("--------- total read across all splits: " + total_read);

    assertEquals(tmp_file_size, total_read);

}

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

public void testReadSplitViaInputRecordsSplit_SplitReset() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)

    // needs to get a jobConf from somewhere, under the hood

    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit_SplitReset");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- testReadSplitViaInputRecordsSplit_SplitReset: debug splits --------- ");

    int total_read = 0;

    System.out.println("> Split [0]: " + splits[0].getLength());

    int count = 0;
    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);
    while (custom_reader.next(value)) {

        count++;

    }

    System.out.println("read: " + count + " records for split " + 0);

    int count_reset = 0;
    custom_reader.ResetToStartOfSplit();
    while (custom_reader.next(value)) {

        count_reset++;

    }

    System.out.println("read: " + count_reset + " records for split after reset " + 0);

    assertEquals(count, count_reset);

}

From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java

License:Apache License

/**
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException 
 * 
 */
public void testGetSplits() throws IOException {

    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", "
                + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {

                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }

                count++;
            }

            System.out.println("last: " + value.toString());

            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

        } finally {
            reader.close();
        }

    } // for each split

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsApplyModel.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsNoSaveModel.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.metrics.TestSaveLoadModel.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java

License:Apache License

@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test with assertions is not a test
    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    // rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);

    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1()).alpha(1)
            .stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);
    LOG.info("---- debug splits --------- ");
    rec_factory.Debug();
    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        LOG.info("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            int actual = rec_factory.processLine(value.toString(), v);

            String ng = rec_factory.GetNewsgroupNameByID(actual);

            // calc stats ---------

            double mu = Math.min(k + 1, 200);
            double ll = learningAlgorithm.logLikelihood(actual, v);
            averageLL = averageLL + (ll - averageLL) / mu;

            Vector p = new DenseVector(20);
            learningAlgorithm.classifyFull(p, v);
            int estimated = p.maxValueIndex();

            int correct = (estimated == actual ? 1 : 0);
            averageCorrect = averageCorrect + (correct - averageCorrect) / mu;
            learningAlgorithm.train(actual, v);
            k++;
            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

            if (k % (bump * scale) == 0) {
                step += 0.25;
                LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                        averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
            }

            learningAlgorithm.close();
            count++;
        }

        LOG.info("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    LOG.info("total read across all splits: " + total_read);
    rec_factory.Debug();
}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsRecordFactory.java

License:Apache License

public void testRecordFactoryOnDatasetShard() throws Exception {

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    //rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "20news-part-0.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    rec_factory.Debug();

    int total_read = 0;

    long ts_start = System.currentTimeMillis();

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            rec_factory.processLine(value.toString(), v);

            count++;
            //break;

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    long ts_total = System.currentTimeMillis() - ts_start;

    double vectors_per_sec = (double) total_read / ((double) ts_total / 1000);

    System.out.println("Time: " + ts_total);

    System.out.println("total recs read across all splits: " + total_read);

    System.out.println("Vectors converted / sec: " + vectors_per_sec);

    assertEquals(11314, total_read);

    rec_factory.Debug();

}

From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLRTest20Newsgroups.java

License:Apache License

public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    //FileInputFormat.setInputPaths(job, workDir);
    FileInputFormat.setInputPaths(job, input_path);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    //LongWritable key = new LongWritable();
    //Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = null;

    try {
        splits = format.getSplits(job, numSplits);
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }

    return splits;

}

From source file:com.cloudera.knittingboar.sgd.TestRunPOLRMasterAndSingleWorker.java

License:Apache License

@Test
public void testRunSingleWorkerSingleMaster() throws Exception {
    // TODO a test with assertions is not a test
    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(configuration);
    // now load the conf stuff into locally used vars
    master.LoadConfigVarsLocally();
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------

    POLRWorkerDriver worker_model_builder_0 = new POLRWorkerDriver();

    // simulates the conf stuff
    worker_model_builder_0.setConf(configuration);

    // ---- this all needs to be done in
    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);
    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");
    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, workDir);
    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    InputSplit[] splits = format.getSplits(job, 1);

    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);

    // TODO: set this up to run through the conf pathways
    worker_model_builder_0.setupInputSplit(custom_reader);

    worker_model_builder_0.LoadConfigVarsLocally();

    worker_model_builder_0.Setup();

    LOG.info("> Feature Size: " + worker_model_builder_0.FeatureVectorSize);
    LOG.info("> Category Size: " + worker_model_builder_0.num_categories);

    for (int x = 0; x < 25; x++) {

        worker_model_builder_0.RunNextTrainingBatch();

        GradientUpdateMessage msg = worker_model_builder_0.GenerateUpdateMessage();
        master.AddIncomingGradientMessageToQueue(msg);
        master.RecvGradientMessage(); // process msg
        master.GenerateGlobalUpdateVector();
        GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();
        worker_model_builder_0.ProcessIncomingParameterVectorMessage(returned_msg);
        LOG.info("---------- cycle " + x + " done ------------- ");
    } // for

    worker_model_builder_0.Debug();
}