Example usage for org.apache.hadoop.mapred JobConf JobConf

List of usage examples for org.apache.hadoop.mapred JobConf JobConf

Introduction

On this page you can find example usage of the org.apache.hadoop.mapred JobConf constructor JobConf(boolean loadDefaults).

Prototype

public JobConf(boolean loadDefaults) 

Document

A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
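
As a quick orientation before the examples, here is a minimal sketch of what the loadDefaults flag controls. It is not taken from the sources below; the class name, the mapred.job.name value, and the use of fs.default.name for illustration are assumptions based on stock Hadoop defaults.

import org.apache.hadoop.mapred.JobConf;

public class JobConfLoadDefaultsSketch {

    public static void main(String[] args) {

        // Default behavior: core-default.xml, core-site.xml and related
        // default resources are read automatically.
        JobConf withDefaults = new JobConf();

        // Passing false skips loading the default resources, so the
        // configuration starts out empty apart from what is set explicitly.
        JobConf noDefaults = new JobConf(false);
        noDefaults.set("mapred.job.name", "example-without-defaults");

        System.out.println("with defaults, fs.default.name = "
                + withDefaults.get("fs.default.name"));   // typically file:///
        System.out.println("without defaults, fs.default.name = "
                + noDefaults.get("fs.default.name"));     // expected to be null
    }
}

Note that the examples below construct JobConf from an existing Configuration (new JobConf(defaultConf)), which copies settings from that configuration rather than toggling default loading; the boolean form above is the variant documented in the prototype.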

Usage

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

/**
 * create an InputRecordSplit and then read some records
 * - make sure we maintain split discipline
 * @throws IOException 
 * 
 */
public void testReadSplitViaInputRecordsSplit() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)

    // needs to get a jobConf from somewhere, under the hood

    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit.txt");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    System.out.println("> setting splits for: " + workDir);

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    //InputSplit test_split = null;

    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            count++;
            //

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    System.out.println("--------- total read across all splits: " + total_read);

    assertEquals(tmp_file_size, total_read);

}

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

public void testReadSplitViaInputRecordsSplit_SplitReset() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)

    // needs to get a jobConf from somewhere, under the hood

    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit_SplitReset");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- testReadSplitViaInputRecordsSplit_SplitReset: debug splits --------- ");

    int total_read = 0;

    System.out.println("> Split [0]: " + splits[0].getLength());

    int count = 0;
    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);
    while (custom_reader.next(value)) {

        count++;

    }

    System.out.println("read: " + count + " records for split " + 0);

    int count_reset = 0;
    custom_reader.ResetToStartOfSplit();
    while (custom_reader.next(value)) {

        count_reset++;

    }

    System.out.println("read: " + count_reset + " records for split after reset " + 0);

    assertEquals(count, count_reset);

}

From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java

License:Apache License

/**
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException 
 * 
 */
public void testGetSplits() throws IOException {

    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", "
                + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {

                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }

                count++;
            }

            System.out.println("last: " + value.toString());

            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

        } finally {
            reader.close();
        }

    } // for each split

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsApplyModel.java

License:Apache License

public void testLoad20NewsModel() throws Exception {

    POLRModelTester tester = new POLRModelTester();

    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();
    tester.Load(model20News.toString());

    // ------------------    

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    InputSplit[] splits = generateDebugSplits(testData20News, job);

    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    tester.setupInputSplit(custom_reader_0);

    tester.RunThroughTestRecords();

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsNoSaveModel.java

License:Apache License

public void testRunMasterAndTwoWorkers() throws Exception {

    int num_passes = 15;

    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        master.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    master.Setup();

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(workDir, job);

    System.out.println("split count: " + splits.length);

    ArrayList<POLRWorkerDriver> workers = new ArrayList<POLRWorkerDriver>();

    for (int x = 0; x < splits.length; x++) {

        POLRWorkerDriver worker_model_builder = new POLRWorkerDriver(); //workers.get(x);
        worker_model_builder.internalID = String.valueOf(x);
        // simulates the conf stuff
        worker_model_builder.setConf(this.generateDebugConfigurationObject());

        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[x]);
        // TODO: set this up to run through the conf pathways
        worker_model_builder.setupInputSplit(custom_reader_0);

        worker_model_builder.LoadConfigVarsLocally();
        worker_model_builder.Setup();

        workers.add(worker_model_builder);

        System.out.println("> Setup Worker " + x);

    }

    for (int x = 0; x < num_passes; x++) {

        for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

            workers.get(worker_id).RunNextTrainingBatch();
            GradientUpdateMessage msg0 = workers.get(worker_id).GenerateUpdateMessage();

            master.AddIncomingGradientMessageToQueue(msg0);
            master.RecvGradientMessage(); // process msg

        }

        // TODO: save model to HDFS
        if (x < num_passes - 1) {

            master.GenerateGlobalUpdateVector();

            GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();

            // process global updates
            for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

                workers.get(worker_id).ProcessIncomingParameterVectorMessage(returned_msg);

            }

            System.out.println("---------- cycle " + x + " done ------------- ");

        } else {

            System.out.println("---------- cycle " + x + " done ------------- ");
            System.out.println("> Saving Model...");

            master.SaveModelLocally("/tmp/TestRunPOLRMasterAndNWorkers.20news.model");

        } // if        

    } // for

    POLRModelTester tester = new POLRModelTester();

    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();

    tester.SetCore(master.polr, master.polr_modelparams, workers.get(0).getRecordFactory());

    //fullRCV1Dir
    InputSplit[] splits_test_data = generateDebugSplits(testData20News, job);

    System.out.println("split count: " + splits_test_data.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits_test_data[0]);

    // TODO: set this up to run through the conf pathways
    tester.setupInputSplit(custom_reader_0);

    tester.RunThroughTestRecords();

}

From source file:com.cloudera.knittingboar.metrics.TestRCV1ApplyModel.java

License:Apache License

public void testApplyModel() throws Exception {

    POLRModelTester tester = new POLRModelTester();
    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();
    tester.Load("/tmp/master_sgd.model");
    // ------------------    

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    InputSplit[] splits = generateDebugSplits(fullRCV1TestFile, job);

    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);
    // TODO: set this up to run through the conf pathways
    tester.setupInputSplit(custom_reader_0);
    tester.RunThroughTestRecords();

}

From source file:com.cloudera.knittingboar.metrics.TestSaveLoadModel.java

License:Apache License

public void testRunMasterAndTwoWorkers() throws Exception {

    int num_passes = 15;

    POLRMasterDriver master = new POLRMasterDriver();
    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    master.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        master.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    master.Setup();
    // ------------------    

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    InputSplit[] splits = generateDebugSplits(workDir, job);

    System.out.println("split count: " + splits.length);

    ArrayList<POLRWorkerDriver> workers = new ArrayList<POLRWorkerDriver>();

    for (int x = 0; x < splits.length; x++) {

        POLRWorkerDriver worker_model_builder = new POLRWorkerDriver(); //workers.get(x);
        worker_model_builder.internalID = String.valueOf(x);
        // simulates the conf stuff
        worker_model_builder.setConf(this.generateDebugConfigurationObject());

        InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[x]);
        // TODO: set this up to run through the conf pathways
        worker_model_builder.setupInputSplit(custom_reader_0);

        worker_model_builder.LoadConfigVarsLocally();
        worker_model_builder.Setup();

        workers.add(worker_model_builder);

        System.out.println("> Setup Worker " + x);

    }

    for (int x = 0; x < num_passes; x++) {

        for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

            workers.get(worker_id).RunNextTrainingBatch();
            GradientUpdateMessage msg0 = workers.get(worker_id).GenerateUpdateMessage();

            master.AddIncomingGradientMessageToQueue(msg0);
            master.RecvGradientMessage(); // process msg

        }

        // TODO: save model to HDFS
        if (x < num_passes - 1) {

            master.GenerateGlobalUpdateVector();

            GlobalParameterVectorUpdateMessage returned_msg = master.GetNextGlobalUpdateMsgFromQueue();

            // process global updates
            for (int worker_id = 0; worker_id < workers.size(); worker_id++) {

                workers.get(worker_id).ProcessIncomingParameterVectorMessage(returned_msg);

            }

            System.out.println("---------- cycle " + x + " done ------------- ");

        } else {

            System.out.println("---------- cycle " + x + " done ------------- ");
            System.out.println("> Saving Model...");

            //master.Debug();

            master.SaveModelLocally("/tmp/TestRunPOLRMasterAndNWorkers.20news.model");

        } // if        
    } // for

    System.out.println("\n\n> Loading Model for tests...");

    POLRModelTester tester = new POLRModelTester();

    // ------------------    
    // generate the debug conf ---- normally setup by YARN stuff
    tester.setConf(this.generateDebugConfigurationObject());
    // now load the conf stuff into locally used vars
    try {
        tester.LoadConfigVarsLocally();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
        System.out.println("Conf load fail: shutting down.");
        assertEquals(0, 1);
    }
    // now construct any needed machine learning data structures based on config
    tester.Setup();
    tester.Load("/tmp/TestRunPOLRMasterAndNWorkers.20news.model");

    assertEquals(1.0e-4, tester.polr.getLambda());

    assertEquals(20, tester.polr.numCategories());

}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java

License:Apache License

@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test with assertions is not a test
    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    // rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);

    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1()).alpha(1)
            .stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);
    LOG.info("---- debug splits --------- ");
    rec_factory.Debug();
    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        LOG.info("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {
            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            int actual = rec_factory.processLine(value.toString(), v);

            String ng = rec_factory.GetNewsgroupNameByID(actual);

            // calc stats ---------

            double mu = Math.min(k + 1, 200);
            double ll = learningAlgorithm.logLikelihood(actual, v);
            averageLL = averageLL + (ll - averageLL) / mu;

            Vector p = new DenseVector(20);
            learningAlgorithm.classifyFull(p, v);
            int estimated = p.maxValueIndex();

            int correct = (estimated == actual ? 1 : 0);
            averageCorrect = averageCorrect + (correct - averageCorrect) / mu;
            learningAlgorithm.train(actual, v);
            k++;
            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

            if (k % (bump * scale) == 0) {
                step += 0.25;
                LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                        averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
            }

            learningAlgorithm.close();
            count++;
        }

        LOG.info("read: " + count + " records for split " + x);
        total_read += count;
    } // for each split
    LOG.info("total read across all splits: " + total_read);
    rec_factory.Debug();
}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsRecordFactory.java

License:Apache License

public void testRecordFactoryOnDatasetShard() throws Exception {

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    //rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "20news-part-0.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    FileInputFormat.setInputPaths(job, workDir);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    rec_factory.Debug();

    int total_read = 0;

    long ts_start = System.currentTimeMillis();

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
            rec_factory.processLine(value.toString(), v);

            count++;
            //break;

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    long ts_total = System.currentTimeMillis() - ts_start;

    double vectors_per_sec = (double) total_read / ((double) ts_total / 1000);

    System.out.println("Time: " + ts_total);

    System.out.println("total recs read across all splits: " + total_read);

    System.out.println("Vectors converted / sec: " + vectors_per_sec);

    assertEquals(total_read, 11314);

    rec_factory.Debug();

}

From source file:com.cloudera.knittingboar.sgd.olr.TestBaseOLRTest20Newsgroups.java

License:Apache License

public void testResults() throws Exception {

    OnlineLogisticRegression classifier = ModelSerializer
            .readBinary(new FileInputStream(model20News.toString()), OnlineLogisticRegression.class);

    Text value = new Text();
    long batch_vec_factory_time = 0;
    int k = 0;
    int num_correct = 0;

    // ---- this all needs to be done in 
    JobConf job = new JobConf(defaultConf);

    // TODO: work on this, splits are generating for everything in dir
    //    InputSplit[] splits = generateDebugSplits(inputDir, job);

    //fullRCV1Dir
    InputSplit[] splits = generateDebugSplits(testData20News, job);

    System.out.println("split count: " + splits.length);

    InputRecordsSplit custom_reader_0 = new InputRecordsSplit(job, splits[0]);

    TwentyNewsgroupsRecordFactory VectorFactory = new TwentyNewsgroupsRecordFactory("\t");

    for (int x = 0; x < 8000; x++) {

        if (custom_reader_0.next(value)) {

            long startTime = System.currentTimeMillis();

            Vector v = new RandomAccessSparseVector(FEATURES);
            int actual = VectorFactory.processLine(value.toString(), v);

            long endTime = System.currentTimeMillis();

            //System.out.println("That took " + (endTime - startTime) + " milliseconds");
            batch_vec_factory_time += (endTime - startTime);

            String ng = VectorFactory.GetClassnameByID(actual); //.GetNewsgroupNameByID( actual );

            // calc stats ---------

            double mu = Math.min(k + 1, 200);
            double ll = classifier.logLikelihood(actual, v);
            //averageLL = averageLL + (ll - averageLL) / mu;
            metrics.AvgLogLikelihood = metrics.AvgLogLikelihood + (ll - metrics.AvgLogLikelihood) / mu;

            Vector p = new DenseVector(20);
            classifier.classifyFull(p, v);
            int estimated = p.maxValueIndex();

            int correct = (estimated == actual ? 1 : 0);
            if (estimated == actual) {
                num_correct++;
            }
            //averageCorrect = averageCorrect + (correct - averageCorrect) / mu;
            metrics.AvgCorrect = metrics.AvgCorrect + (correct - metrics.AvgCorrect) / mu;

            //this.polr.train(actual, v);

            k++;
            //        if (x == this.BatchSize - 1) {
            int bump = bumps[(int) Math.floor(step) % bumps.length];
            int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

            if (k % (bump * scale) == 0) {
                step += 0.25;

                System.out.printf(
                        "Worker %s:\t Tested Recs: %10d, numCorrect: %d, AvgLL: %10.3f, Percent Correct: %10.2f, VF: %d\n",
                        "OLR-standard-test", k, num_correct, metrics.AvgLogLikelihood, metrics.AvgCorrect * 100,
                        batch_vec_factory_time);

            }

            classifier.close();

        } else {

            // nothing else to process in split!
            break;

        } // if

    } // for the number of passes in the run    

}