List of usage examples for org.apache.hadoop.mapreduce RecordReader getCurrentKey
public abstract KEYIN getCurrentKey() throws IOException, InterruptedException;
From source file:org.apache.jena.hadoop.rdf.io.input.bnodes.AbstractBlankNodeTests.java
License:Apache License
/**
 * Test that starts with two blank nodes in two different files and checks
 * that writing them to a single file does not conflate them
 *
 * @throws IOException
 * @throws InterruptedException
 */
@Test
public void blank_node_identity_02() throws IOException, InterruptedException {
    Assume.assumeTrue("Requires ParserProfile be respected", this.respectsParserProfile());
    Assume.assumeFalse("Requires that Blank Node identity not be preserved",
            this.preservesBlankNodeIdentity());

    // Temporary files
    File a = File.createTempFile("bnode_identity", getInitialInputExtension());
    File b = File.createTempFile("bnode_identity", getInitialInputExtension());
    File intermediateOutputDir = Files.createTempDirectory("bnode_identity", new FileAttribute[0]).toFile();
    try {
        // Prepare the input data
        // Same blank node but in different files so must be treated as
        // different blank nodes and not converge
        List<T> tuples = new ArrayList<>();
        Node bnode = NodeFactory.createBlankNode();
        Node pred = NodeFactory.createURI("http://example.org/predicate");
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("first")));
        writeTuples(a, tuples);

        tuples.clear();
        tuples.add(createTuple(bnode, pred, NodeFactory.createLiteral("second")));
        writeTuples(b, tuples);

        // Set up fake job which will process the two files
        Configuration config = new Configuration(true);
        InputFormat<LongWritable, TValue> inputFormat = createInitialInputFormat();
        Job job = Job.getInstance(config);
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(a.getAbsolutePath()), new Path(b.getAbsolutePath()));
        FileOutputFormat.setOutputPath(job, new Path(intermediateOutputDir.getAbsolutePath()));
        JobContext context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        List<InputSplit> splits = inputFormat.getSplits(context);
        Assert.assertEquals(2, splits.size());

        // Prepare the output writing - putting all output to a single file
        OutputFormat<LongWritable, TValue> outputFormat = createIntermediateOutputFormat();
        TaskAttemptContext outputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                createAttemptID(1, 2, 1));
        RecordWriter<LongWritable, TValue> writer = outputFormat.getRecordWriter(outputTaskContext);

        for (InputSplit split : splits) {
            // Initialize the input reading
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    createAttemptID(1, 1, 1));
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);

            // Copy the input to the output - all triples go to a single
            // output
            while (reader.nextKeyValue()) {
                writer.write(reader.getCurrentKey(), reader.getCurrentValue());
            }
        }
        writer.close(outputTaskContext);

        // Promote outputs from temporary status
        promoteInputs(intermediateOutputDir);

        // Now we need to create a subsequent job that reads the
        // intermediate outputs
        // The Blank nodes should have been given separate identities so we
        // should not be conflating them, this is the opposite problem to
        // that described in JENA-820
        LOGGER.debug("Intermediate output directory is {}", intermediateOutputDir.getAbsolutePath());
        job = Job.getInstance(config);
        inputFormat = createIntermediateInputFormat();
        job.setInputFormatClass(inputFormat.getClass());
        NLineInputFormat.setNumLinesPerSplit(job, 100);
        FileInputFormat.setInputPaths(job, new Path(intermediateOutputDir.getAbsolutePath()));
        context = new JobContextImpl(job.getConfiguration(), job.getJobID());

        // Get the splits
        splits = inputFormat.getSplits(context);
        Assert.assertEquals(1, splits.size());

        // Expect to end up with TWO distinct blank nodes, one per original input
        // file (the original comment here claimed "a single blank node", which
        // contradicts the assertion below)
        Set<Node> nodes = new HashSet<Node>();
        for (InputSplit split : splits) {
            TaskAttemptContext inputTaskContext = new TaskAttemptContextImpl(job.getConfiguration(),
                    new TaskAttemptID());
            RecordReader<LongWritable, TValue> reader = inputFormat.createRecordReader(split, inputTaskContext);
            reader.initialize(split, inputTaskContext);
            while (reader.nextKeyValue()) {
                nodes.add(getSubject(reader.getCurrentValue().get()));
            }
        }
        // Nodes must not converge - the two identities must have stayed distinct
        Assert.assertEquals(2, nodes.size());
    } finally {
        a.delete();
        b.delete();
        deleteDirectory(intermediateOutputDir);
    }
}
From source file:org.apache.jena.tdbloader4.partitioners.SplitSampler.java
License:Apache License
/** * From each split sampled, take the first numSamples / numSplits records. *///from www . ja v a 2s. co m @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K, V> inf, Job job) throws IOException, InterruptedException { List<InputSplit> splits = inf.getSplits(job); ArrayList<K> samples = new ArrayList<K>(numSamples); int splitsToSample = Math.min(maxSplitsSampled, splits.size()); int samplesPerSplit = numSamples / splitsToSample; log.debug("Sampling {} splits, taking {} samples per split", splitsToSample, samplesPerSplit); long records = 0; for (int i = 0; i < splitsToSample; ++i) { TaskAttemptContext samplingContext = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID()); InputSplit split = splits.get(i); log.debug("Sampling {} split", split); RecordReader<K, V> reader = inf.createRecordReader(split, samplingContext); reader.initialize(split, samplingContext); while (reader.nextKeyValue()) { LongQuadWritable currentKey = (LongQuadWritable) reader.getCurrentKey(); // TODO: why do we need to do that? Why on earth we have -1 in subject, predicate or object position??? if ((currentKey.get(0) > 0) && (currentKey.get(1) > 0) && (currentKey.get(2) > 0)) { LongQuadWritable key = new LongQuadWritable(currentKey.get(0), currentKey.get(1), currentKey.get(2), currentKey.get(3)); log.debug("Sampled {}", key); samples.add((K) key); ++records; if (records >= (i + 1) * samplesPerSplit) { log.debug("Records is {} and (i + 1) * samplesPerSplit is {}", records, (i + 1) * samplesPerSplit); break; } } } reader.close(); } return (K[]) samples.toArray(); }
From source file:org.apache.mahout.classifier.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override protected boolean runJob(Job job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); // retrieve the splits TextInputFormat input = new TextInputFormat(); List<InputSplit> splits = input.getSplits(job); int nbSplits = splits.size(); log.debug("Nb splits : {}", nbSplits); InputSplit[] sorted = new InputSplit[nbSplits]; splits.toArray(sorted);//w ww . j a v a 2 s .c o m Builder.sortSplits(sorted); int numTrees = Builder.getNbTrees(conf); // total number of trees TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID()); firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees); /* first instance id in hadoop's order */ //int[] firstIds = new int[nbSplits]; /* partitions' sizes in hadoop order */ int[] sizes = new int[nbSplits]; // to compute firstIds, process the splits in file order long slowest = 0; // duration of slowest map int firstId = 0; for (InputSplit split : splits) { int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task); reader.initialize(split, task); Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees); long time = System.currentTimeMillis(); //firstIds[hp] = firstId; while (reader.nextKeyValue()) { mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput); firstId++; sizes[hp]++; } mapper.cleanup(firstOutput); time = System.currentTimeMillis() - time; log.info("Duration : {}", DFUtils.elapsedTime(time)); if (time > slowest) { slowest = time; } } log.info("Longest duration : {}", DFUtils.elapsedTime(slowest)); return true; }
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
@Override protected boolean runJob(Job job) throws IOException, InterruptedException { Configuration conf = job.getConfiguration(); // retrieve the splits TextInputFormat input = new TextInputFormat(); List<InputSplit> splits = input.getSplits(job); int nbSplits = splits.size(); log.debug("Nb splits : {}", nbSplits); InputSplit[] sorted = new InputSplit[nbSplits]; splits.toArray(sorted);//from w w w .j ava2 s . c o m Builder.sortSplits(sorted); int numTrees = Builder.getNbTrees(conf); // total number of trees TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID()); firstOutput = new MockContext(new Step1Mapper(), conf, task.getTaskAttemptID(), numTrees); firstIds = new int[nbSplits]; sizes = new int[nbSplits]; // to compute firstIds, process the splits in file order long slowest = 0; // duration of slowest map int firstId = 0; for (int p = 0; p < nbSplits; p++) { InputSplit split = splits.get(p); int hp = ArrayUtils.indexOf(sorted, split); // hadoop's partition RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task); reader.initialize(split, task); Step1Mapper mapper = new MockStep1Mapper(getTreeBuilder(), dataset, getSeed(), hp, nbSplits, numTrees); long time = System.currentTimeMillis(); firstIds[hp] = firstId; while (reader.nextKeyValue()) { mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), firstOutput); firstId++; sizes[hp]++; } mapper.cleanup(firstOutput); time = System.currentTimeMillis() - time; log.info("Duration : {}", DFUtils.elapsedTime(time)); if (time > slowest) { slowest = time; } } log.info("Longest duration : {}", DFUtils.elapsedTime(slowest)); return true; }
From source file:org.apache.mahout.df.mapreduce.partial.PartialSequentialBuilder.java
License:Apache License
/** * The second step uses the trees to predict the rest of the instances outside * their own partition/*from w ww. j a v a 2 s . c o m*/ */ protected void secondStep(Configuration conf, Path forestPath, PredictionCallback callback) throws IOException, InterruptedException { JobContext jobContext = new JobContext(conf, new JobID()); // retrieve the splits TextInputFormat input = new TextInputFormat(); List<InputSplit> splits = input.getSplits(jobContext); int nbSplits = splits.size(); log.debug("Nb splits : {}", nbSplits); InputSplit[] sorted = new InputSplit[nbSplits]; splits.toArray(sorted); Builder.sortSplits(sorted); int numTrees = Builder.getNbTrees(conf); // total number of trees // compute the expected number of outputs int total = 0; for (int p = 0; p < nbSplits; p++) { total += Step2Mapper.nbConcerned(nbSplits, numTrees, p); } TaskAttemptContext task = new TaskAttemptContext(conf, new TaskAttemptID()); secondOutput = new MockContext(new Step2Mapper(), conf, task.getTaskAttemptID(), numTrees); long slowest = 0; // duration of slowest map for (int partition = 0; partition < nbSplits; partition++) { InputSplit split = sorted[partition]; RecordReader<LongWritable, Text> reader = input.createRecordReader(split, task); // load the output of the 1st step int nbConcerned = Step2Mapper.nbConcerned(nbSplits, numTrees, partition); TreeID[] fsKeys = new TreeID[nbConcerned]; Node[] fsTrees = new Node[nbConcerned]; FileSystem fs = forestPath.getFileSystem(conf); int numInstances = InterResults.load(fs, forestPath, nbSplits, numTrees, partition, fsKeys, fsTrees); Step2Mapper mapper = new Step2Mapper(); mapper.configure(partition, dataset, fsKeys, fsTrees, numInstances); long time = System.currentTimeMillis(); while (reader.nextKeyValue()) { mapper.map(reader.getCurrentKey(), reader.getCurrentValue(), secondOutput); } mapper.cleanup(secondOutput); time = System.currentTimeMillis() - time; log.info("Duration : {}", DFUtils.elapsedTime(time)); if (time > slowest) { slowest 
= time; } } log.info("Longest duration : {}", DFUtils.elapsedTime(slowest)); }
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
/**
 * Checks that Step0Mapper records, for each sorted split, the partition index,
 * the first key seen in the split and the number of records in the split.
 */
public void testStep0Mapper() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);
    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);
    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    Step0Context context = new Step0Context(new Step0Mapper(), job.getConfiguration(), new TaskAttemptID(),
            NUM_MAPS);

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];

        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        reader.initialize(split, context);

        Step0Mapper mapper = new Step0Mapper();
        mapper.configure(p);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            // NOTE: the reader reuses this writable; key.get() is taken before
            // the next advance, so capturing the primitive here is safe
            LongWritable key = reader.getCurrentKey();

            if (firstKey == null) {
                firstKey = key.get();
            }

            mapper.map(key, reader.getCurrentValue(), context);

            size++;
        }

        mapper.cleanup(context);

        // validate the mapper's output
        assertEquals(p, context.keys[p]);
        // NOTE(review): firstKey.longValue() NPEs if a split is empty — assumes
        // every generated split holds at least one record; TODO confirm
        assertEquals(firstKey.longValue(), context.values[p].getFirstId());
        assertEquals(size, context.values[p].getSize());
    }
}
From source file:org.apache.mahout.df.mapreduce.partial.Step0JobTest.java
License:Apache License
/**
 * Checks Step0Job.processOutput(): collects per-split (first key, size) pairs
 * by reading the splits directly, then verifies that the partitions returned
 * by processOutput() carry the expected first instance ids.
 */
public void testProcessOutput() throws Exception {
    Random rng = RandomUtils.getRandom();

    // create a dataset large enough to be split up
    String descriptor = Utils.randomDescriptor(rng, NUM_ATTRIBUTES);
    double[][] source = Utils.randomDoubles(rng, descriptor, NUM_INSTANCES);

    // each instance label is its index in the dataset
    int labelId = Utils.findLabel(descriptor);
    for (int index = 0; index < NUM_INSTANCES; index++) {
        source[index][labelId] = index;
    }

    String[] sData = Utils.double2String(source);

    // write the data to a file
    Path dataPath = Utils.writeDataToTestFile(sData);

    // prepare a data converter
    Dataset dataset = DataLoader.generateDataset(descriptor, sData);
    DataConverter converter = new DataConverter(dataset);

    Job job = new Job();
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.setInputPaths(job, dataPath);
    setMaxSplitSize(job.getConfiguration(), dataPath, NUM_MAPS);

    // retrieve the splits
    TextInputFormat input = new TextInputFormat();
    List<InputSplit> splits = input.getSplits(job);
    assertEquals(NUM_MAPS, splits.size());

    InputSplit[] sorted = new InputSplit[NUM_MAPS];
    splits.toArray(sorted);
    Builder.sortSplits(sorted);

    List<Integer> keys = new ArrayList<Integer>();
    List<Step0Output> values = new ArrayList<Step0Output>();

    int[] expectedIds = new int[NUM_MAPS];

    TaskAttemptContext context = new TaskAttemptContext(job.getConfiguration(), new TaskAttemptID());

    for (int p = 0; p < NUM_MAPS; p++) {
        InputSplit split = sorted[p];
        RecordReader<LongWritable, Text> reader = input.createRecordReader(split, context);
        // NOTE(review): the reader is never closed in this loop — leaks the
        // underlying stream if many splits are read; TODO close in a finally
        reader.initialize(split, context);

        Long firstKey = null;
        int size = 0;

        while (reader.nextKeyValue()) {
            LongWritable key = reader.getCurrentKey();
            Text value = reader.getCurrentValue();

            if (firstKey == null) {
                firstKey = key.get();
                // the label of the first instance doubles as its expected id
                expectedIds[p] = converter.convert(0, value.toString()).getLabel();
            }

            size++;
        }

        keys.add(p);
        // NOTE(review): unboxing firstKey NPEs for an empty split — assumes
        // every split holds at least one record; TODO confirm
        values.add(new Step0Output(firstKey, size));
    }

    Step0Output[] partitions = Step0Job.processOutput(keys, values);

    int[] actualIds = Step0Output.extractFirstIds(partitions);

    assertTrue("Expected: " + Arrays.toString(expectedIds) + " But was: " + Arrays.toString(actualIds),
            Arrays.equals(expectedIds, actualIds));
}
From source file:org.apache.rya.accumulo.mr.GraphXEdgeInputFormatTest.java
License:Apache License
@SuppressWarnings("rawtypes") @Test// w w w .jav a 2 s . c o m public void testInputFormat() throws Exception { RyaStatement input = RyaStatement.builder().setSubject(new RyaURI("http://www.google.com")) .setPredicate(new RyaURI("http://some_other_uri")).setObject(new RyaURI("http://www.yahoo.com")) .setColumnVisibility(new byte[0]).setValue(new byte[0]).build(); apiImpl.add(input); Job jobConf = Job.getInstance(); GraphXEdgeInputFormat.setMockInstance(jobConf, instance.getInstanceName()); GraphXEdgeInputFormat.setConnectorInfo(jobConf, username, password); GraphXEdgeInputFormat.setTableLayout(jobConf, TABLE_LAYOUT.SPO); GraphXEdgeInputFormat.setInputTableName(jobConf, table); GraphXEdgeInputFormat.setInputTableName(jobConf, table); GraphXEdgeInputFormat.setScanIsolation(jobConf, false); GraphXEdgeInputFormat.setLocalIterators(jobConf, false); GraphXEdgeInputFormat.setOfflineTableScan(jobConf, false); GraphXEdgeInputFormat inputFormat = new GraphXEdgeInputFormat(); JobContext context = new JobContextImpl(jobConf.getConfiguration(), jobConf.getJobID()); List<InputSplit> splits = inputFormat.getSplits(context); Assert.assertEquals(1, splits.size()); TaskAttemptContext taskAttemptContext = new TaskAttemptContextImpl(context.getConfiguration(), new TaskAttemptID(new TaskID(), 1)); RecordReader reader = inputFormat.createRecordReader(splits.get(0), taskAttemptContext); RecordReader ryaStatementRecordReader = (RecordReader) reader; ryaStatementRecordReader.initialize(splits.get(0), taskAttemptContext); List<Edge> results = new ArrayList<Edge>(); while (ryaStatementRecordReader.nextKeyValue()) { Edge writable = (Edge) ryaStatementRecordReader.getCurrentValue(); long srcId = writable.srcId(); long destId = writable.dstId(); RyaTypeWritable rtw = null; Object text = ryaStatementRecordReader.getCurrentKey(); Edge<RyaTypeWritable> edge = new Edge<RyaTypeWritable>(srcId, destId, rtw); results.add(edge); System.out.println(text); } System.out.println(results.size()); 
System.out.println(results); Assert.assertTrue(results.size() == 2); }
From source file:org.apache.tinkerpop.gremlin.hadoop.structure.io.RecordReaderWriterTest.java
License:Apache License
private static void validateFileSplits(final List<FileSplit> fileSplits, final Configuration configuration, final Class<? extends InputFormat<NullWritable, VertexWritable>> inputFormatClass, final Optional<Class<? extends OutputFormat<NullWritable, VertexWritable>>> outFormatClass) throws Exception { final InputFormat inputFormat = ReflectionUtils.newInstance(inputFormatClass, configuration); final TaskAttemptContext job = new TaskAttemptContextImpl(configuration, new TaskAttemptID(UUID.randomUUID().toString(), 0, TaskType.MAP, 0, 0)); int vertexCount = 0; int outEdgeCount = 0; int inEdgeCount = 0; final OutputFormat<NullWritable, VertexWritable> outputFormat = outFormatClass.isPresent() ? ReflectionUtils.newInstance(outFormatClass.get(), configuration) : null;//w ww. j av a 2 s . co m final RecordWriter<NullWritable, VertexWritable> writer = null == outputFormat ? null : outputFormat.getRecordWriter(job); boolean foundKeyValue = false; for (final FileSplit split : fileSplits) { logger.info("\treading file split {}", split.getPath().getName() + " ({}", split.getStart() + "..." 
+ (split.getStart() + split.getLength()), "{} {} bytes)"); final RecordReader reader = inputFormat.createRecordReader(split, job); float lastProgress = -1f; while (reader.nextKeyValue()) { //System.out.println("" + reader.getProgress() + "> " + reader.getCurrentKey() + ": " + reader.getCurrentValue()); final float progress = reader.getProgress(); assertTrue(progress >= lastProgress); assertEquals(NullWritable.class, reader.getCurrentKey().getClass()); final VertexWritable vertexWritable = (VertexWritable) reader.getCurrentValue(); if (null != writer) writer.write(NullWritable.get(), vertexWritable); vertexCount++; outEdgeCount = outEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.OUT)); inEdgeCount = inEdgeCount + (int) IteratorUtils.count(vertexWritable.get().edges(Direction.IN)); // final Vertex vertex = vertexWritable.get(); assertEquals(Integer.class, vertex.id().getClass()); if (vertex.value("name").equals("SUGAR MAGNOLIA")) { foundKeyValue = true; assertEquals(92, IteratorUtils.count(vertex.edges(Direction.OUT))); assertEquals(77, IteratorUtils.count(vertex.edges(Direction.IN))); } lastProgress = progress; } } assertEquals(8049, outEdgeCount); assertEquals(8049, inEdgeCount); assertEquals(outEdgeCount, inEdgeCount); assertEquals(808, vertexCount); assertTrue(foundKeyValue); if (null != writer) { writer.close(new TaskAttemptContextImpl(configuration, job.getTaskAttemptID())); for (int i = 1; i < 10; i++) { final File outputDirectory = new File( new URL(configuration.get("mapreduce.output.fileoutputformat.outputdir")).toURI()); final List<FileSplit> splits = generateFileSplits( new File(outputDirectory.getAbsoluteFile() + "/_temporary/0/_temporary/" + job.getTaskAttemptID().getTaskID().toString().replace("task", "attempt") + "_0" + "/part-m-00000"), i); validateFileSplits(splits, configuration, inputFormatClass, Optional.empty()); } } }
From source file:org.commoncrawl.hadoop.io.mapreduce.ArcFileInputFormatTests.java
License:Apache License
static void validateSplit(FileSystem fs, InputSplit split, List<Pair<Path, List<TestRecord>>> splits, RecordReader<Text, BytesWritable> reader) throws IOException, InterruptedException { int splitDataIndex = getIndexOfSplit(splits, split); Assert.assertTrue(splitDataIndex != -1); List<TestRecord> records = splits.get(splitDataIndex).e1; int itemIndex = 0; // iterate and validate stuff ... while (reader.nextKeyValue()) { Text key = reader.getCurrentKey(); BytesWritable value = reader.getCurrentValue(); TestRecord testRecord = records.get(itemIndex++); // get test key bytes as utf-8 bytes ... byte[] testKeyBytes = testRecord.url.getBytes(Charset.forName("UTF-8")); // compare against raw key bytes to validate key is the same (Text's utf-8 mapping code replaces invalid characters // with ?, which causes our test case (which does use invalid characters to from the key, to break. Assert.assertTrue(ArcFileReaderTests.compareTo(testKeyBytes, 0, testKeyBytes.length, key.getBytes(), 0, key.getLength()) == 0);/* ww w.ja v a 2s . com*/ // retured bytes represent the header(encoded in utf-8), terminated by a \r\n\r\n. The content follows this terminator // we search for this specific byte pattern to locate start of content, then compare it against source ... int indexofHeaderTerminator = ByteArrayUtils.indexOf(value.getBytes(), 0, value.getLength(), "\r\n\r\n".getBytes()); indexofHeaderTerminator += 4; Assert.assertTrue(ArcFileReaderTests.compareTo(testRecord.data, 0, testRecord.data.length, value.getBytes(), indexofHeaderTerminator, testRecord.data.length) == 0); } reader.close(); Assert.assertEquals(itemIndex, ArcFileReaderTests.BASIC_TEST_RECORD_COUNT); splits.remove(splitDataIndex); }