Example usage for the org.apache.hadoop.mapred.TextInputFormat constructor TextInputFormat()

Introduction

This page collects example usages of the no-argument constructor TextInputFormat() of the class org.apache.hadoop.mapred.TextInputFormat.

Prototype

TextInputFormat

Source Link

Usage

From source file:RunText.java

License:Apache License

/**
 * Reads one file with {@link TextInputFormat} using {@code numThreads} reader threads and
 * prints a records/sec figure every five seconds until {@code runningThreads} drops to zero.
 *
 * <p>The file is cut into {@code numThreads} byte ranges of equal length; the last range is
 * extended to the end of the file so that the remainder of the integer division is not lost.
 *
 * @param args command-line arguments, parsed by JCommander into the {@code Options} object
 * @throws Exception if option handling, file-system access or thread joining fails
 */
public static void main(String[] args) throws Exception {
    o = new Options();
    JCommander jc = null;
    try {
        jc = new JCommander(o, args);
        jc.setProgramName("./runText");
    } catch (ParameterException e) {
        System.out.println(e.getMessage());
        // NOTE(review): "v" has no leading dash, unlike the other option names - confirm
        // against the Options class before changing it.
        String[] valid = { "-p", "path", "-d", "delimiter", "v", "value", "-i", "index" };
        new JCommander(o, valid).usage();
        System.exit(-1);
    }
    if (o.help) {
        jc.usage();
        System.exit(0);
    }
    path = new Path(o.path);
    delim = o.delimiter.getBytes()[0];
    toFind = o.value;
    index = o.index;
    numThreads = o.threads;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TextInputFormat format = new TextInputFormat();
    long fileLen = fs.getFileStatus(path).getLen();
    long len = fileLen / numThreads;

    List<Thread> threads = Lists.newArrayList();

    for (int i = 0; i < numThreads; i++) {
        long start = i * len;
        // The final split absorbs the fileLen % numThreads bytes that the even division drops.
        long splitLen = (i == numThreads - 1) ? fileLen - start : len;
        FileSplit split = new FileSplit(path, start, splitLen, new String[] { "" });
        threads.add(new Thread(new RunText(split, format)));
    }

    runningThreads = new AtomicInteger(numThreads);

    for (Thread t : threads) {
        t.start();
    }

    // Sample the shared counter periodically and report the rate over the last interval.
    int prev = 0;
    int current;
    long t1 = System.nanoTime();
    long t2;
    while (runningThreads.get() > 0) {
        Thread.sleep(5000);
        current = totalCount.get();
        t2 = System.nanoTime();
        System.out.println(String.format("%f records/sec", (current - prev) * 1e9 / (t2 - t1)));
        t1 = t2;
        prev = current;
    }

    for (Thread t : threads) {
        t.join();
    }

    fs.close();
}

From source file:TestTextInputFormat.java

License:Open Source License

/**
 * Reads every record of the input path through {@link TextInputFormat}, deserializes each
 * line with the SerDe returned by {@code initSerDe}, and prints per-split and total read times.
 *
 * <p>Any exception raised while reading is caught, its stack trace printed, and its message
 * written to standard output.
 *
 * @param argv {@code argv[0]} is the input path, {@code argv[1]} the output path
 */
public static void main(String[] argv) throws IOException, SerDeException {
    try {
        if (argv.length != 2) {
            System.out.println("TestTextInputFormat <input> <output>");
            System.exit(-1);
        }

        JobConf conf = new JobConf(TestTextInputFormat.class);

        conf.setJobName("TestTextInputFormat");

        conf.setNumMapTasks(1);
        conf.setNumReduceTasks(1);

        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Unit.Record.class);

        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(FormatStorageOutputFormat.class);
        // Was "flase": a misspelt boolean that is not a valid true/false value.
        conf.set("mapred.output.compress", "false");

        conf.set("mapred.input.dir", argv[0]);

        LazySimpleSerDe serDe = initSerDe(conf);
        LazySimpleStructObjectInspector oi = (LazySimpleStructObjectInspector) serDe.getObjectInspector();

        FileInputFormat.setInputPaths(conf, argv[0]);
        Path outputPath = new Path(argv[1]);
        FileOutputFormat.setOutputPath(conf, outputPath);

        InputFormat inputFormat = new TextInputFormat();
        ((TextInputFormat) inputFormat).configure(conf);
        InputSplit[] inputSplits = inputFormat.getSplits(conf, 1);
        if (inputSplits.length == 0) {
            System.out.println("inputSplits is empty");
            return;
        } else {
            System.out.println("get Splits:" + inputSplits.length);
        }

        int totalDelay = 0;
        for (int i = 0; i < inputSplits.length; i++) {
            RecordReader<WritableComparable, Writable> currRecReader = inputFormat
                    .getRecordReader(inputSplits[i], conf, Reporter.NULL);
            try {
                WritableComparable key = currRecReader.createKey();
                Writable value = currRecReader.createValue();

                long begin = System.currentTimeMillis();
                int count = 0;
                while (currRecReader.next(key, value)) {

                    Object row = serDe.deserialize((Text) value);
                    oi.getStructFieldsDataAsList(row);

                    count++;
                }
                long end = System.currentTimeMillis();

                // Whole seconds only; sub-second remainders are truncated per split.
                long delay = (end - begin) / 1000;
                totalDelay += delay;
                System.out.println(count + " record read over, delay " + delay + " s");
            } finally {
                // Release the reader for this split even when reading fails part-way.
                currRecReader.close();
            }
        }

        System.out.println("total delay:" + totalDelay);
    } catch (Exception e) {
        e.printStackTrace();
        System.out.println("get exception:" + e.getMessage());
    }
}

From source file:com.cloudera.knittingboar.io.InputRecordsSplit.java

License:Apache License

public InputRecordsSplit(JobConf jobConf, InputSplit split) throws IOException {

    this.jobConf = jobConf;
    this.split = split;
    this.input_format = new TextInputFormat();

    // RecordReader<LongWritable, Text> reader =
    // format.getRecordReader(splits[x], job, reporter);
    this.reader = input_format.getRecordReader(this.split, this.jobConf, voidReporter);
    this.key = reader.createKey();
    // Text value = reader.createValue();

}

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

/**
 * create an InputRecordSplit and then read some records
 * /*from   www.j  a va2 s.co m*/
 * - make sure we maintain split discipline
 * @throws IOException 
 * 
 */
public void testReadSplitViaInputRecordsSplit() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)

    // needs to get a jobConf from somewhere, under the hood

    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit.txt");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    System.out.println("> setting splits for: " + workDir);

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- debug splits --------- ");

    //InputSplit test_split = null;

    int total_read = 0;

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength());

        int count = 0;
        InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
        while (custom_reader.next(value)) {

            count++;
            //

        }

        System.out.println("read: " + count + " records for split " + x);

        total_read += count;

    } // for each split

    System.out.println("--------- total read across all splits: " + total_read);

    assertEquals(tmp_file_size, total_read);

}

From source file:com.cloudera.knittingboar.io.TestInputRecordsSplit.java

License:Apache License

/**
 * Reads the first split to the end, resets the reader to the start of the split, reads it
 * again, and checks that both passes return the same number of records.
 *
 * @throws IOException if the temporary file cannot be written or the split cannot be read
 */
public void testReadSplitViaInputRecordsSplit_SplitReset() throws IOException {

    // InputRecordsSplit(JobConf jobConf, InputSplit split)
    // needs to get a jobConf from somewhere, under the hood
    // needs a split calculated from the aforementioned jobConf

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testReadSplitViaInputRecordsSplit_SplitReset");

    int tmp_file_size = 2000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // Write tmp_file_size identical lines as the input for both read passes.
    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete, wrote " + tmp_file_size + " recs");

    FileInputFormat.setInputPaths(job, file);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    System.out.println("---- testReadSplitViaInputRecordsSplit_SplitReset: debug splits --------- ");

    System.out.println("> Split [0]: " + splits[0].getLength());

    // First pass over the split.
    int count = 0;
    InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[0]);
    while (custom_reader.next(value)) {
        count++;
    }

    System.out.println("read: " + count + " records for split " + 0);

    // Second pass after rewinding; it must see the same records again.
    int count_reset = 0;
    custom_reader.ResetToStartOfSplit();
    while (custom_reader.next(value)) {
        count_reset++;
    }

    System.out.println("read: " + count_reset + " records for split after reset " + 0);

    assertEquals(count, count_reset);

}

From source file:com.cloudera.knittingboar.io.TestSplitCalcs.java

License:Apache License

/**
 * /*from w w  w  .  j a v  a 2s.  c  om*/
 * - use the TextInputFormat.getSplits() to test pulling split info
 * @throws IOException 
 * 
 */
public void testGetSplits() throws IOException {

    TextInputFormat input = new TextInputFormat();

    JobConf job = new JobConf(defaultConf);
    Path file = new Path(workDir, "testGetSplits.txt");

    int tmp_file_size = 200000;

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    Writer writer = new OutputStreamWriter(localFs.create(file));
    try {
        for (int i = 0; i < tmp_file_size; i++) {
            writer.write(
                    "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 1, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, 99");
            writer.write("\n");
        }
    } finally {
        writer.close();
    }

    System.out.println("file write complete");

    // A reporter that does nothing
    Reporter reporter = Reporter.NULL;

    //    localFs.delete(workDir, true);
    FileInputFormat.setInputPaths(job, file);

    // try splitting the file in a variety of sizes
    TextInputFormat format = new TextInputFormat();
    format.configure(job);
    LongWritable key = new LongWritable();
    Text value = new Text();

    int numSplits = 1;

    InputSplit[] splits = format.getSplits(job, numSplits);

    LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);

    assertEquals(2, splits.length);

    System.out.println("---- debug splits --------- ");

    for (int x = 0; x < splits.length; x++) {

        System.out.println("> Split [" + x + "]: " + splits[x].getLength() + ", " + splits[x].toString() + ", "
                + splits[x].getLocations()[0]);

        RecordReader<LongWritable, Text> reader = format.getRecordReader(splits[x], job, reporter);
        try {
            int count = 0;
            while (reader.next(key, value)) {

                if (count == 0) {
                    System.out.println("first: " + value.toString());
                    assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));
                }

                count++;
            }

            System.out.println("last: " + value.toString());

            assertTrue(value.toString().contains("a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p"));

        } finally {
            reader.close();
        }

    } // for each split

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsApplyModel.java

License:Apache License

/**
 * Computes the input splits for {@code input_path} with {@link TextInputFormat}, requesting
 * a single split.
 *
 * @param input_path path registered on {@code job} as the input location
 * @param job        job configuration; its input paths are overwritten with {@code input_path}
 * @return the splits reported by {@code TextInputFormat.getSplits}
 * @throws IllegalStateException if the splits cannot be computed; the underlying
 *         {@link IOException} is attached as the cause
 */
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    // ---- set where we'll read the input files from -------------
    FileInputFormat.setInputPaths(job, input_path);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    try {
        return format.getSplits(job, numSplits);
    } catch (IOException e) {
        // Fail here with the real cause instead of returning null and failing later.
        throw new IllegalStateException("could not compute splits for " + input_path, e);
    }

}

From source file:com.cloudera.knittingboar.metrics.Test20NewsNoSaveModel.java

License:Apache License

/**
 * Computes the input splits for {@code input_path} with {@link TextInputFormat}, requesting
 * a single split.
 *
 * @param input_path path registered on {@code job} as the input location
 * @param job        job configuration; its input paths are overwritten with {@code input_path}
 * @return the splits reported by {@code TextInputFormat.getSplits}
 * @throws IllegalStateException if the splits cannot be computed; the underlying
 *         {@link IOException} is attached as the cause
 */
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    try {
        return format.getSplits(job, numSplits);
    } catch (IOException e) {
        // Fail here with the real cause instead of returning null and failing later.
        throw new IllegalStateException("could not compute splits for " + input_path, e);
    }

}

From source file:com.cloudera.knittingboar.metrics.TestSaveLoadModel.java

License:Apache License

/**
 * Computes the input splits for {@code input_path} with {@link TextInputFormat}, requesting
 * a single split.
 *
 * @param input_path path registered on {@code job} as the input location
 * @param job        job configuration; its input paths are overwritten with {@code input_path}
 * @return the splits reported by {@code TextInputFormat.getSplits}
 * @throws IllegalStateException if the splits cannot be computed; the underlying
 *         {@link IOException} is attached as the cause
 */
public InputSplit[] generateDebugSplits(Path input_path, JobConf job) {

    long block_size = localFs.getDefaultBlockSize();

    System.out.println("default block size: " + (block_size / 1024 / 1024) + "MB");

    FileInputFormat.setInputPaths(job, input_path);

    TextInputFormat format = new TextInputFormat();
    format.configure(job);

    int numSplits = 1;

    try {
        return format.getSplits(job, numSplits);
    } catch (IOException e) {
        // Fail here with the real cause instead of returning null and failing later.
        throw new IllegalStateException("could not compute splits for " + input_path, e);
    }

}

From source file:com.cloudera.knittingboar.records.TestTwentyNewsgroupsCustomRecordParseOLRRun.java

License:Apache License

/**
 * Streams every record found under {@code workDir} through an online logistic regression
 * learner, logging the running log-likelihood and accuracy whenever the record counter hits
 * a multiple of the current bump-times-scale interval.
 *
 * <p>The learner is closed once, after all splits have been processed.
 *
 * @throws Exception if split calculation, record reading or parsing fails
 */
@Test
public void testRecordFactoryOnDatasetShard() throws Exception {
    // TODO a test without assertions is not a test
    // p.270 ----- metrics to track lucene's parsing mechanics, progress,
    // performance of OLR ------------
    double averageLL = 0.0;
    double averageCorrect = 0.0;
    int k = 0;
    double step = 0.0;
    int[] bumps = new int[] { 1, 2, 5 };

    TwentyNewsgroupsRecordFactory rec_factory = new TwentyNewsgroupsRecordFactory("\t");
    // rec_factory.setClassSplitString("\t");

    JobConf job = new JobConf(defaultConf);

    long block_size = localFs.getDefaultBlockSize(workDir);

    LOG.info("default block size: " + (block_size / 1024 / 1024) + "MB");

    // matches the OLR setup on p.269 ---------------
    // stepOffset, decay, and alpha --- describe how the learning rate decreases
    // lambda: amount of regularization
    // learningRate: amount of initial learning rate
    @SuppressWarnings("resource")
    OnlineLogisticRegression learningAlgorithm = new OnlineLogisticRegression(20, FEATURES, new L1()).alpha(1)
            .stepOffset(1000).decayExponent(0.9).lambda(3.0e-5).learningRate(20);

    int total_read = 0;
    try {
        FileInputFormat.setInputPaths(job, workDir);

        TextInputFormat format = new TextInputFormat();
        format.configure(job);
        Text value = new Text();

        int numSplits = 1;

        InputSplit[] splits = format.getSplits(job, numSplits);

        LOG.info("requested " + numSplits + " splits, splitting: got =        " + splits.length);
        LOG.info("---- debug splits --------- ");
        rec_factory.Debug();

        for (int x = 0; x < splits.length; x++) {

            LOG.info("> Split [" + x + "]: " + splits[x].getLength());

            int count = 0;
            InputRecordsSplit custom_reader = new InputRecordsSplit(job, splits[x]);
            while (custom_reader.next(value)) {
                Vector v = new RandomAccessSparseVector(TwentyNewsgroupsRecordFactory.FEATURES);
                int actual = rec_factory.processLine(value.toString(), v);

                String ng = rec_factory.GetNewsgroupNameByID(actual);

                // calc stats ---------

                // Running averages over a window that grows up to 200 records.
                double mu = Math.min(k + 1, 200);
                double ll = learningAlgorithm.logLikelihood(actual, v);
                averageLL = averageLL + (ll - averageLL) / mu;

                Vector p = new DenseVector(20);
                learningAlgorithm.classifyFull(p, v);
                int estimated = p.maxValueIndex();

                int correct = (estimated == actual ? 1 : 0);
                averageCorrect = averageCorrect + (correct - averageCorrect) / mu;
                learningAlgorithm.train(actual, v);
                k++;
                int bump = bumps[(int) Math.floor(step) % bumps.length];
                int scale = (int) Math.pow(10, Math.floor(step / bumps.length));

                if (k % (bump * scale) == 0) {
                    step += 0.25;
                    LOG.info(String.format("%10d %10.3f %10.3f %10.2f %s %s", k, ll, averageLL,
                            averageCorrect * 100, ng, rec_factory.GetNewsgroupNameByID(estimated)));
                }

                count++;
            }

            LOG.info("read: " + count + " records for split " + x);
            total_read += count;
        } // for each split
    } finally {
        // Close the learner exactly once, after training; it was previously closed per record.
        learningAlgorithm.close();
    }
    LOG.info("total read across all splits: " + total_read);
    rec_factory.Debug();
}