Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usage of the org.apache.hadoop.mapred JobConf constructor JobConf(boolean loadDefaults).

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
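
As a quick orientation before the examples, here is a minimal sketch of what the loadDefaults flag controls. It is not taken from the sources below; the class name, the mapred.job.name value, and the use of fs.default.name for illustration are assumptions based on stock Hadoop defaults.

import org.apache.hadoop.mapred.JobConf;

public class JobConfLoadDefaultsSketch {

    public static void main(String[] args) {

        // Default behavior: core-default.xml, core-site.xml and related
        // default resources are read automatically.
        JobConf withDefaults = new JobConf();

        // Passing false skips loading the default resources, so the
        // configuration starts out empty apart from what is set explicitly.
        JobConf noDefaults = new JobConf(false);
        noDefaults.set("mapred.job.name", "example-without-defaults");

        System.out.println("with defaults, fs.default.name = "
                + withDefaults.get("fs.default.name"));   // typically file:///
        System.out.println("without defaults, fs.default.name = "
                + noDefaults.get("fs.default.name"));     // expected to be null
    }
}

Note that the examples below construct JobConf from an existing Configuration (new JobConf(defaultConf)), which copies settings from that configuration rather than toggling default loading; the boolean form above is the variant documented in the prototype.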

Usage

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

/**
 * create an InputRecordSplit and then read some records
 * - make sure we maintain split discipline
 * @throws IOException 
 * 
 */
public void testReadSplitViaInputRecordsSplit() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)

    // needs to get a jobConf from somewhere, under the hood

    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit.txt");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    System.out.println("> setting splits for: " + workDir);

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    //InputSplit test_split = null;

    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            count++;
            //

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    System.out.println("--------- total read across all splits: " + total_read);

    assertEquals(tmp_file_size, total_read);

}

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

public void testReadSplitViaInputRecordsSplit_SplitReset() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)

    // needs to get a jobConf from somewhere, under the hood

    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit_SplitReset");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- testReadSplitViaInputRecordsSplit_SplitReset: debug splits --------- ");

    int total_read = 0;

    System.out.println("> Split [0]: " + splits[0].getLength());

    int count = 0;
    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);
    while (custom_reader.next(value)) {

        count++;

    }

    System.out.println("read: " + count + " records for split " + 0);

    int count_reset = 0;
    custom_reader.ResetToStartOfSplit();
    while (custom_reader.next(value)) {

        count_reset++;

    }

    System.out.println("read: " + count_reset + " records for split after reset " + 0);

    assertEquals(count, count_reset);

}

From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java

License:Apache License

/**
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException 
 * 
 */
public void testGetSplits() throws IOException {

    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", "
                + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {

                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }

                count++;
            }

            System.out.println("last: " + value.toString());

            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

        } finally {
            reader.close();
        }

    } // for each split

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsApplyModel.java

License:Apache License

public void testLoad20NewsModel() throws Exception {

    POLRModelTester tester = new POLRModelTester();

    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();
    tester.Load(model20News.toString());

    // ------------------    

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    InputSplit[] splits = generateDebugSplits(testData20News, job);

    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    tester.setupInputSplit(custom_reader_0);

    tester.RunThroughTestRecords();

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsNoSaveModel.java

License:Apache License

public void testRunMasterAndTwoWorkers() throws Exception {

    int num_passes = 15;

    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        master.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    master.Setup();

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(workDir, job);

    System.out.println("split count: " + splits.length);

    ArrayList<POLRWorkerDriver> workers = new ArrayList<POLRWorkerDriver>();

    for (int x = 0; x < splits.length; x++) {

        POLRWorkerDriver worker_model_builder = new POLRWorkerDriver(); //workers.get(x);
        worker_model_builder.internalID = String.valueOf(x);
        // simulates the conf stuff
        worker_model_builder.setConf(this.generateDebugConfigurationObject());

        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[x]);
        // TODO: set this up to run through the conf pathways
        worker_model_builder.setupInputSplit(custom_reader_0);

        worker_model_builder.LoadConfigVarsLocally();
        worker_model_builder.Setup();

        workers.add(worker_model_builder);

        System.out.println("> Setup Worker " + x);

    }

    for (int x = 0; x < num_passes; x++) {

        for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

            workers.get(worker_id).RunNextTrainingBatch();
            GradientUpdateMessage msg0 = workers.get(worker_id).GenerateUpdateMessage();

            master.AddIncomingGradientMessageToQueue(msg0);
            master.RecvGradientMessage(); // process msg

        }

        // TODO: save model to HDFS
        if (x < num_passes - 1) {

            master.GenerateGlobalUpdateVector();

            GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();

            // process global updates
            for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

                workers.get(worker_id).ProcessIncomingParameterVectorMessage(returned_msg);

            }

            System.out.println("---------- cycle " + x + " done ------------- ");

        } else {

            System.out.println("---------- cycle " + x + " done ------------- ");
            System.out.println("> Saving Model...");

            master.SaveModelLocally("/tmp/TestRunPOLRMasterAndNWorkers.20news.model");

        } // if        

    } // for

    POLRModelTester tester = new POLRModelTester();

    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();

    tester.SetCore(master.polr, master.polr_modelparams, workers.get(0).getRecordFactory());

    //fullRCV1Dir
    InputSplit[] splits_test_data = generateDebugSplits(testData20News, job);

    System.out.println("split count: " + splits_test_data.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits_test_data[0]);

    // TODO: set this up to run through the conf pathways
    tester.setupInputSplit(custom_reader_0);

    tester.RunThroughTestRecords();

}

From source file:com.cloudera.knittingboar.metrics.TestRCV1ApplyModel.java

License:Apache License

public void testApplyModel() throws Exception {

    POLRModelTester tester = new POLRModelTester();
    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();
    tester.Load("/tmp/master_sgd.model");
    // ------------------    

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    InputSplit[] splits = generateDebugSplits(fullRCV1TestFile, job);

    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    // TODO: set this up to run through the conf pathways
    tester.setupInputSplit(custom_reader_0);
    tester.RunThroughTestRecords();

}

From source file:com.cloudera.knittingboar.metrics.TestSaveLoadModel.java

License:Apache License

public void testRunMasterAndTwoWorkers() throws Exception {

    int num_passes = 15;

    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        master.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------    

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(workDir, job);

    System.out.println("split count: " + splits.length);

    ArrayList<POLRWorkerDriver> workers = new ArrayList<POLRWorkerDriver>();

    for (int x = 0; x < splits.length; x++) {

        POLRWorkerDriver worker_model_builder = new POLRWorkerDriver(); //workers.get(x);
        worker_model_builder.internalID = String.valueOf(x);
        // simulates the conf stuff
        worker_model_builder.setConf(this.generateDebugConfigurationObject());

        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[x]);
        // TODO: set this up to run through the conf pathways
        worker_model_builder.setupInputSplit(custom_reader_0);

        worker_model_builder.LoadConfigVarsLocally();
        worker_model_builder.Setup();

        workers.add(worker_model_builder);

        System.out.println("> Setup Worker " + x);

    }

    for (int x = 0; x < num_passes; x++) {

        for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

            workers.get(worker_id).RunNextTrainingBatch();
            GradientUpdateMessage msg0 = workers.get(worker_id).GenerateUpdateMessage();

            master.AddIncomingGradientMessageToQueue(msg0);
            master.RecvGradientMessage(); // process msg

        }

        // TODO: save model to HDFS
        if (x < num_passes - 1) {

            master.GenerateGlobalUpdateVector();

            GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();

            // process global updates
            for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

                workers.get(worker_id).ProcessIncomingParameterVectorMessage(returned_msg);

            }

            System.out.println("---------- cycle " + x + " done ------------- ");

        } else {

            System.out.println("---------- cycle " + x + " done ------------- ");
            System.out.println("> Saving Model...");

            //master.Debug();

            master.SaveModelLocally("/tmp/TestRunPOLRMasterAndNWorkers.20news.model");

        } // if        
    } // for

    System.out.println("\n\n> Loading Model for tests...");

    POLRModelTester tester = new POLRModelTester();

    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();
    tester.Load("/tmp/TestRunPOLRMasterAndNWorkers.20news.model");

    assertEquals(1.0e-4, tester.polr.getLambda());

    assertEquals(20, tester.polr.numCategories());

}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java

License:Apache License

@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test with assertions is not a test
    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    // rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);

    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1()).alpha(1)
            .stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);
    LOG.info("---- debug splits --------- ");
    rec_factory.Debug();
    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        LOG.info("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            int actual = rec_factory.processLine(value.toString(), v);

            String ng = rec_factory.GetNewsgroupNameByID(actual);

            // calc stats ---------

            double mu = Math.min(k + 1, 200);
            double ll = learningAlgorithm.logLikelihood(actual, v);
            averageLL = averageLL + (ll - averageLL) / mu;

            Vector p = new DenseVector(20);
            learningAlgorithm.classifyFull(p, v);
            int estimated = p.maxValueIndex();

            int correct = (estimated == actual ? 1 : 0);
            averageCorrect = averageCorrect + (correct - averageCorrect) / mu;
            learningAlgorithm.train(actual, v);
            k++;
            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

            if (k % (bump * scale) == 0) {
                step += 0.25;
                LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                        averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
            }

            learningAlgorithm.close();
            count++;
        }

        LOG.info("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    LOG.info("total read across all splits: " + total_read);
    rec_factory.Debug();
}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsRecordFactory.java

License:Apache License

public void testRecordFactoryOnDatasetShard() throws Exception {

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    //rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "20news-part-0.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    rec_factory.Debug();

    int total_read = 0;

    long ts_start = System.currentTimeMillis();

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            rec_factory.processLine(value.toString(), v);

            count++;
            //break;

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    long ts_total = System.currentTimeMillis() - ts_start;

    double vectors_per_sec = (double) total_read / ((double) ts_total / 1000);

    System.out.println("Time: " + ts_total);

    System.out.println("total recs read across all splits: " + total_read);

    System.out.println("Vectors converted / sec: " + vectors_per_sec);

    assertEquals(total_read, 11314);

    rec_factory.Debug();

}

From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLRTest20Newsgroups.java

License:Apache License

public void testResults() throws Exception {

    OnlineLogisticRegression classifier = ModelSerializer
            .readBinary(new FileInputStream(model20News.toString()), OnlineLogisticRegression.class);

    Text value = new Text();
    long batch_vec_factory_time = 0;
    int k = 0;
    int num_correct = 0;

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    //    InputSplit[] splits = generateDebugSplits(inputDir, job);

    //fullRCV1Dir
    InputSplit[] splits = generateDebugSplits(testData20News, job);

    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);

    TwentyNewsgroupsRecordFactory VectorFactory = new TwentyNewsgroupsRecordFactory("\t");

    for (int x = 0; x < 8000; x++) {

        if (custom_reader_0.next(value)) {

            long startTime = System.currentTimeMillis();

            Vector v = new RandomAccessSparseVector(FEATURES);
            int actual = VectorFactory.processLine(value.toString(), v);

            long endTime = System.currentTimeMillis();

            //System.out.println("That took " + (endTime - startTime) + " milliseconds");
            batch_vec_factory_time += (endTime - startTime);

            String ng = VectorFactory.GetClassnameByID(actual); //.GetNewsgroupNameByID( actual );

            // calc stats ---------

            double mu = Math.min(k + 1, 200);
            double ll = classifier.logLikelihood(actual, v);
            //averageLL = averageLL + (ll - averageLL) / mu;
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood + (ll - metrics.AvgLogLikelihood) / mu;

            Vector p = new DenseVector(20);
            classifier.classifyFull(p, v);
            int estimated = p.maxValueIndex();

            int correct = (estimated == actual ? 1 : 0);
            if (estimated == actual) {
                num_correct++;
            }
            //averageCorrect = averageCorrect + (correct - averageCorrect) / mu;
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;

            //this.polr.train(actual, v);

            k++;
            //        if (x == this.BatchSize - 1) {
            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

            if (k % (bump * scale) == 0) {
                step += 0.25;

                System.out.printf(
                        "Worker %s:\t Tested Recs: %10d, numCorrect: %d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        "OLR-standard-test", k, num_correct, metrics.AvgLogLikelihood, metrics.AvgCorrect * 100,
                        batch_vec_factory_time);

            }

            classifier.close();

        } else {

            // nothing else to process in split!
            break;

        } // if

    } // for the number of passes in the run    

}