List of usage examples for org.apache.hadoop.mapreduce.lib.input.TextInputFormat.createRecordReader
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
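Before the full examples below, here is a minimal sketch of the common pattern, assuming the Hadoop 2.x mapreduce API; the class name TextInputFormatSketch and the command-line file argument are illustrative placeholders, not part of any of the source files listed here. The format creates the reader, and the caller must call initialize() before iterating with nextKeyValue().

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class TextInputFormatSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // hypothetical: path to a local or HDFS text file
        FileSystem fs = path.getFileSystem(conf);
        long length = fs.getFileStatus(path).getLen();

        TextInputFormat format = new TextInputFormat();
        InputSplit split = new FileSplit(path, 0, length, null);
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
        reader.initialize(split, context); // must be called before the first nextKeyValue()
        try {
            while (reader.nextKeyValue()) {
                LongWritable key = reader.getCurrentKey(); // byte offset of the line within the file
                Text value = reader.getCurrentValue();     // the line itself, without the line terminator
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}

The examples that follow all use this same create/initialize/iterate sequence, differing mainly in how they build the InputSplit and TaskAttemptContext.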
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
        throws IOException, InterruptedException {
    TextInputFormat textInputFormat = new TextInputFormat();
    InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), null);
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
            TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
    RecordReader<LongWritable, Text> recordReader =
            textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();
    List<Map.Entry> batch = new ArrayList<>();
    while (hasNext && batch.size() < batchSize) {
        batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
                String.valueOf(recordReader.getCurrentValue())));
        hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
    }
    return batch;
}
From source file:org.apache.mahout.classifier.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    /* first instance id in hadoop's order */
    //int[] firstIds = new int[nbSplits];
    /* partitions' sizes in hadoop order */
    int[] sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        //firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    firstIds = new int[nbSplits];
    sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (int p = 0; p < nbSplits; p++) {
        InputSplit split = splits.get(p);
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 */
protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback)
        throws IOException, InterruptedException {
    JobContext jobContext = new JobContext(conf, new JobID());

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(jobContext);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < nbSplits; p++) {
        total += Step2Mapper.nbConcerned(nbSplits, numTrees, p);
    }

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees);

    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < nbSplits; partition++) {
        InputSplit split = sorted[partition];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(conf);
        int numInstances = InterResults.load(fs, forestPath, nbSplits, numTrees, partition, fsKeys, fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), secondOutput);
        }

        mapper.cleanup(secondOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    Step0Context context = new Step0Context(new Step0Mapper(), job.getConfiguration(), new TaskAttemptID(), NUM_MAPS);

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();

            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, reader.getCurrentValue(), context);

            size++;
        }

        mapper.cleanup(context);

        // validate the mapper's output
        assertEquals(p, context.keys[p]);
        assertEquals(firstKey.longValue(), context.values[p].getFirstId());
        assertEquals(size, context.values[p].getSize());
    }
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < NUM_INSTANCES; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[NUM_MAPS];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            Text value = reader.getCurrentValue();

            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).getLabel();
            }

            size++;
        }

        keys.add(p);
        values.add(new Step0Output(firstKey, size));
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file:org.mrgeo.format.CsvInputFormatTest.java
License:Apache License
@Test
@Category(UnitTest.class)
public void testNullIgnore() throws Exception {
    FileSystem fs = new RawLocalFileSystem();
    try {
        int lineCount = 0;

        // Write columns file which defines the columns title and type
        String cstr = "<?xml version='1.0' encoding='UTF-8'?>\n<AllColumns firstLineHeader='false'>\n";
        cstr += " <Column name='name' type='Nominal'/>\n";
        cstr += " <Column name='x' type='Numeric'/>\n";
        cstr += " <Column name='y' type='Numeric'/>\n";
        cstr += "</AllColumns>\n";
        FileOutputStream fos = new FileOutputStream(output + "/nulXY.csv.columns");
        PrintStream ps = new PrintStream(fos);
        ps.print(cstr);
        ps.close();

        // Write csv test data
        fos = new FileOutputStream(output + "/nullXY.csv");
        ps = new PrintStream(fos);
        // populated rows
        for (int ii = 0; ii < 10; ii++) {
            ps.print("ASDF,1.0,1.0\n");
            lineCount++;
        }
        // empty rows
        ps.print("ASDF,,1.0\n");
        ps.print("ASDF,1.0,\n");
        ps.print("ASDF,,\n");
        lineCount += 3;
        // populated rows
        for (int ii = 0; ii < 5; ii++) {
            ps.print("ASDF,1.0,1.0\n");
            lineCount++;
        }
        ps.close();

        System.out.println(output + "nulXY.csv");

        Job j = new Job(new Configuration());
        Configuration c = j.getConfiguration();
        fs.setConf(c);
        Path testFile = new Path(output, "nullXY.csv");
        testFile = fs.makeQualified(testFile);
        InputSplit split;
        long l;
        long start;

        TextInputFormat format = new TextInputFormat();
        split = new FileSplit(testFile, 0, lineCount * 1000, null);
        RecordReader<LongWritable, Text> reader2 = format.createRecordReader(split,
                HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID()));
        reader2.initialize(split, HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID()));
        l = 0;
        start = System.currentTimeMillis();
        while (reader2.nextKeyValue()) {
            reader2.getCurrentValue().toString();
            l++;
        }
        Assert.assertEquals(lineCount, l);
        System.out.printf("text line reader with null x,y ignore: %d\n", System.currentTimeMillis() - start);
    } catch (Exception e) {
        e.printStackTrace();
        throw e;
    } finally {
        fs.close();
    }
}