List of usage examples for org.apache.hadoop.mapreduce.lib.input.TextInputFormat.createRecordReader
@Override
public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
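Before the full examples below, here is a minimal sketch of the common pattern, assuming the Hadoop 2.x mapreduce API; the class name TextInputFormatSketch and the command-line file argument are illustrative placeholders, not part of any of the source files listed here. The format creates the reader, and the caller must call initialize() before iterating with nextKeyValue().

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class TextInputFormatSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]); // hypothetical: path to a local or HDFS text file
        FileSystem fs = path.getFileSystem(conf);
        long length = fs.getFileStatus(path).getLen();

        TextInputFormat format = new TextInputFormat();
        InputSplit split = new FileSplit(path, 0, length, null);
        TaskAttemptContext context = new TaskAttemptContextImpl(conf, new TaskAttemptID());

        RecordReader<LongWritable, Text> reader = format.createRecordReader(split, context);
        reader.initialize(split, context); // must be called before the first nextKeyValue()
        try {
            while (reader.nextKeyValue()) {
                LongWritable key = reader.getCurrentKey(); // byte offset of the line within the file
                Text value = reader.getCurrentValue();     // the line itself, without the line terminator
                System.out.println(key + "\t" + value);
            }
        } finally {
            reader.close();
        }
    }
}

The examples that follow all use this same create/initialize/iterate sequence, differing mainly in how they build the InputSplit and TaskAttemptContext.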
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
private List<Map.Entry> previewTextBatch(FileStatus fileStatus, int batchSize)
        throws IOException, InterruptedException {
    TextInputFormat textInputFormat = new TextInputFormat();
    InputSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), null);
    TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(hadoopConf,
            TaskAttemptID.forName("attempt_1439420318532_0011_m_000000_0"));
    RecordReader<LongWritable, Text> recordReader =
            textInputFormat.createRecordReader(fileSplit, taskAttemptContext);
    recordReader.initialize(fileSplit, taskAttemptContext);
    boolean hasNext = recordReader.nextKeyValue();
    List<Map.Entry> batch = new ArrayList<>();
    while (hasNext && batch.size() < batchSize) {
        batch.add(new Pair(fileStatus.getPath().toUri().getPath() + "::" + recordReader.getCurrentKey(),
                String.valueOf(recordReader.getCurrentValue())));
        hasNext = recordReader.nextKeyValue(); // not like iterator.hasNext, actually advances
    }
    return batch;
}
From source file:org.apache.mahout.classifier.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    /* first instance id in hadoop's order */
    //int[] firstIds = new int[nbSplits];
    /* partitions' sizes in hadoop order */
    int[] sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (InputSplit split : splits) {
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        //firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override
protected boolean runJob(Job job) throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees);

    firstIds = new int[nbSplits];
    sizes = new int[nbSplits];

    // to compute firstIds, process the splits in file order
    long slowest = 0; // duration of slowest map
    int firstId = 0;
    for (int p = 0; p < nbSplits; p++) {
        InputSplit split = splits.get(p);
        int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);
        reader.initialize(split, task);

        Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees);

        long time = System.currentTimeMillis();

        firstIds[hp] = firstId;

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput);
            firstId++;
            sizes[hp]++;
        }

        mapper.cleanup(firstOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
    return true;
}
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
/**
 * The second step uses the trees to predict the rest of the instances outside
 * their own partition
 */
protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback)
        throws IOException, InterruptedException {
    JobContext jobContext = new JobContext(conf, new JobID());

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(jobContext);

    int nbSplits = splits.size();
    log.debug("Nb splits : {}", nbSplits);

    InputSplit[] sorted = new InputSplit[nbSplits];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    int numTrees = Builder.getNbTrees(conf); // total number of trees

    // compute the expected number of outputs
    int total = 0;
    for (int p = 0; p < nbSplits; p++) {
        total += Step2Mapper.nbConcerned(nbSplits, numTrees, p);
    }

    TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID());

    secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees);

    long slowest = 0; // duration of slowest map

    for (int partition = 0; partition < nbSplits; partition++) {
        InputSplit split = sorted[partition];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task);

        // load the output of the 1st step
        int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition);
        TreeID[] fsKeys = new TreeID[nbConcerned];
        Node[] fsTrees = new Node[nbConcerned];

        FileSystem fs = forestPath.getFileSystem(conf);
        int numInstances = InterResults.load(fs, forestPath, nbSplits, numTrees, partition, fsKeys, fsTrees);

        Step2Mapper mapper = new Step2Mapper();
        mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances);

        long time = System.currentTimeMillis();

        while (reader.nextKeyValue()) {
            mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), secondOutput);
        }

        mapper.cleanup(secondOutput);

        time = System.currentTimeMillis() - time;
        log.info("Duration : {}", DFUtils.elapsedTime(time));

        if (time > slowest) {
            slowest = time;
        }
    }

    log.info("Longest duration : {}", DFUtils.elapsedTime(slowest));
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    Step0Context context = new Step0Context(new Step0Mapper(), job.getConfiguration(), new TaskAttemptID(), NUM_MAPS);

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();

            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, reader.getCurrentValue(), context);

            size++;
        }

        mapper.cleanup(context);

        // validate the mapper's output
        assertEquals(p, context.keys[p]);
        assertEquals(firstKey.longValue(), context.values[p].getFirstId());
        assertEquals(size, context.values[p].getSize());
    }
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < NUM_INSTANCES; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);

    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[NUM_MAPS];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            Text value = reader.getCurrentValue();

            if (firstKey == null) {
                firstKey = key.get();
                expectedIds[p] = converter.convert(0, value.toString()).getLabel();
            }

            size++;
        }

        keys.add(p);
        values.add(new Step0Output(firstKey, size));
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file:org.mrgeo.format.CsvInputFormatTest.java
License:Apache License
@Test
@Category(UnitTest.class)
public void testNullIgnore() throws Exception {
    FileSystem fs = new RawLocalFileSystem();
    try {
        int lineCount = 0;

        // Write columns file which defines the columns title and type
        String cstr = "<?xml version='1.0' encoding='UTF-8'?>\n<AllColumns firstLineHeader='false'>\n";
        cstr += " <Column name='name' type='Nominal'/>\n";
        cstr += " <Column name='x' type='Numeric'/>\n";
        cstr += " <Column name='y' type='Numeric'/>\n";
        cstr += "</AllColumns>\n";
        FileOutputStream fos = new FileOutputStream(output + "/nulXY.csv.columns");
        PrintStream ps = new PrintStream(fos);
        ps.print(cstr);
        ps.close();

        // Write csv test data
        fos = new FileOutputStream(output + "/nullXY.csv");
        ps = new PrintStream(fos);
        // populated rows
        for (int ii = 0; ii < 10; ii++) {
            ps.print("ASDF,1.0,1.0\n");
            lineCount++;
        }
        // empty rows
        ps.print("ASDF,,1.0\n");
        ps.print("ASDF,1.0,\n");
        ps.print("ASDF,,\n");
        lineCount += 3;
        // populated rows
        for (int ii = 0; ii < 5; ii++) {
            ps.print("ASDF,1.0,1.0\n");
            lineCount++;
        }
        ps.close();

        System.out.println(output + "nulXY.csv");

        Job j = new Job(new Configuration());
        Configuration c = j.getConfiguration();
        fs.setConf(c);
        Path testFile = new Path(output, "nullXY.csv");
        testFile = fs.makeQualified(testFile);
        InputSplit split;
        long l;
        long start;

        TextInputFormat format = new TextInputFormat();
        split = new FileSplit(testFile, 0, lineCount * 1000, null);
        RecordReader<LongWritable, Text> reader2 = format.createRecordReader(split,
                HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID()));
        reader2.initialize(split, HadoopUtils.createTaskAttemptContext(c, new TaskAttemptID()));
        l = 0;
        start = System.currentTimeMillis();
        while (reader2.nextKeyValue()) {
            reader2.getCurrentValue().toString();
            l++;
        }
        Assert.assertEquals(lineCount, l);
        System.out.printf("text line reader with null x,y ignore: %d\n", System.currentTimeMillis() - start);
    } catch (Exception e) {
        e.printStackTrace();
        throw e;
    } finally {
        fs.close();
    }
}