List of usage examples for org.apache.hadoop.mapreduce.RecordReader#nextKeyValue
public abstract boolean nextKeyValue() throws IOException, InterruptedException;
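The signature above is all each example below exercises: initialize the reader for a split, loop while nextKeyValue() returns true, and pull the current pair via getCurrentKey()/getCurrentValue(). As a minimal standalone sketch of that loop, assuming Hadoop 2.x and TextInputFormat over a local text file (the class name and setup here are illustrative only, not drawn from the source files below):

// Minimal sketch of the canonical RecordReader read loop, assuming TextInputFormat
// over a local text file; the class name and driver setup are illustrative only.
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.TaskAttemptID;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl;

public class NextKeyValueSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        FileInputFormat.addInputPath(job, new Path(args[0]));

        TextInputFormat inputFormat = new TextInputFormat();
        TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());

        // Outside a running task, splits must be computed and the readers driven by hand.
        List<InputSplit> splits = inputFormat.getSplits(job);
        for (InputSplit split : splits) {
            RecordReader<LongWritable, Text> reader = inputFormat.createRecordReader(split, context);
            reader.initialize(split, context);
            // nextKeyValue() advances the reader; key and value are only valid after it returns true.
            while (reader.nextKeyValue()) {
                System.out.println(reader.getCurrentKey() + "\t" + reader.getCurrentValue());
            }
            reader.close();
        }
    }
}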
From source file:edu.umn.cs.sthadoop.trajectory.KNNDTW.java
License:Open Source License
private static <S extends Shape> long knnLocal(Path inFile, Path outPath, OperationsParams params)
        throws IOException, InterruptedException {
    int iterations = 0;
    FileSystem fs = inFile.getFileSystem(params);
    Point queryPoint = (Point) OperationsParams.getShape(params, "point");
    int k = params.getInt("k", 1);
    // Top-k objects are retained in this object
    PriorityQueue<ShapeWithDistance<S>> knn = new KNNObjects<ShapeWithDistance<S>>(k);
    SpatialInputFormat3<Rectangle, Shape> inputFormat = new SpatialInputFormat3<Rectangle, Shape>();
    final GlobalIndex<Partition> gIndex = SpatialSite.getGlobalIndex(fs, inFile);
    double kthDistance = Double.MAX_VALUE;
    if (gIndex != null) {
        // There is a global index, use it
        PriorityQueue<ShapeWithDistance<Partition>> partitionsToProcess = new PriorityQueue<KNNDTW.ShapeWithDistance<Partition>>() {
            {
                initialize(gIndex.size());
            }

            @Override
            protected boolean lessThan(Object a, Object b) {
                return ((ShapeWithDistance<Partition>) a).distance < ((ShapeWithDistance<Partition>) b).distance;
            }
        };
        for (Partition p : gIndex) {
            double distance = p.getMinDistanceTo(queryPoint.x, queryPoint.y);
            partitionsToProcess.insert(new ShapeWithDistance<Partition>(p.clone(), distance));
        }
        while (partitionsToProcess.size() > 0 && partitionsToProcess.top().distance <= kthDistance) {
            ShapeWithDistance<Partition> partitionToProcess = partitionsToProcess.pop();
            // Process this partition
            Path partitionPath = new Path(inFile, partitionToProcess.shape.filename);
            long length = fs.getFileStatus(partitionPath).getLen();
            FileSplit fsplit = new FileSplit(partitionPath, 0, length, new String[0]);
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(fsplit, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(fsplit, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(fsplit, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }
            iterations++;
            while (reader.nextKeyValue()) {
                Iterable<Shape> shapes = reader.getCurrentValue();
                for (Shape shape : shapes) {
                    double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
                    if (distance <= kthDistance)
                        knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
                }
            }
            reader.close();

            if (knn.size() >= k)
                kthDistance = knn.top().distance;
        }
    } else {
        // No global index, have to scan the whole file
        Job job = new Job(params);
        SpatialInputFormat3.addInputPath(job, inFile);
        List<InputSplit> splits = inputFormat.getSplits(job);
        for (InputSplit split : splits) {
            RecordReader<Rectangle, Iterable<Shape>> reader = inputFormat.createRecordReader(split, null);
            if (reader instanceof SpatialRecordReader3) {
                ((SpatialRecordReader3) reader).initialize(split, params);
            } else if (reader instanceof RTreeRecordReader3) {
                ((RTreeRecordReader3) reader).initialize(split, params);
            } else if (reader instanceof HDFRecordReader) {
                ((HDFRecordReader) reader).initialize(split, params);
            } else {
                throw new RuntimeException("Unknown record reader");
            }
            iterations++;
            while (reader.nextKeyValue()) {
                Iterable<Shape> shapes = reader.getCurrentValue();
                for (Shape shape : shapes) {
                    double distance = shape.distanceTo(queryPoint.x, queryPoint.y);
                    knn.insert(new ShapeWithDistance<S>((S) shape.clone(), distance));
                }
            }
            reader.close();
        }
        if (knn.size() >= k)
            kthDistance = knn.top().distance;
    }
    long resultCount = knn.size();
    if (outPath != null && params.getBoolean("output", true)) {
        FileSystem outFS = outPath.getFileSystem(params);
        PrintStream ps = new PrintStream(outFS.create(outPath));
        Vector<ShapeWithDistance<S>> resultsOrdered = new Vector<ShapeWithDistance<S>>((int) resultCount);
        resultsOrdered.setSize((int) resultCount);
        while (knn.size() > 0) {
            ShapeWithDistance<S> nextAnswer = knn.pop();
            resultsOrdered.set(knn.size(), nextAnswer);
        }

        Text text = new Text();
        for (ShapeWithDistance<S> answer : resultsOrdered) {
            text.clear();
            TextSerializerHelper.serializeDouble(answer.distance, text, ',');
            answer.shape.toText(text);
            ps.println(text);
        }
        ps.close();
    }
    TotalIterations.addAndGet(iterations);
    return resultCount;
}
From source file:eu.scape_project.tb.wc.archd.test.ARCTest.java
License:Apache License
/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);

    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();
        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: "
                + currType + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc
                + " Length: " + currLength);

        // check example record 1 (the first one, the header of the ARC file)
        if (start == 1) {
            // "myContentString" is an arbitrary string snippet that we know exists in the content
            // stream, at a known position. We search for it in the content we read and compare
            // the resulting index against the known value.
            currContent = content2String(currStream);
            myContentString = "defaultgz_orderxml";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + " => Index is: " + myContentStringIndex);

            assertEquals("ID not equal", "20130522085320/filedesc://3-2-20130522085320-00000-prepc2.arc",
                    currKey.toString());
            assertEquals("MIME Type not equal", "text/plain", currMIMEType);
            assertEquals("Response type not equal", "response", currType);
            assertEquals("URL not equal", "filedesc://3-2-20130522085320-00000-prepc2.arc", currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 08:53:20"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 1190, currLength);
            assertEquals("Content seems not to be correct", 531, myContentStringIndex);
        }
        start++;
    }
}
From source file:eu.scape_project.tb.wc.archd.test.WARCTest.java
License:Apache License
/**
 * Test of nextKeyValue method, of class ArcRecordReader.
 */
public void testNextKeyValue() throws Exception {
    RecordReader<Text, ArcRecord> recordReader = myArcF.createRecordReader(split, tac);
    recordReader.initialize(split, tac);

    int start = 1;
    while (recordReader.nextKeyValue()) {
        Text currKey = recordReader.getCurrentKey();
        ArcRecord currValue = recordReader.getCurrentValue();
        String currMIMEType = currValue.getMimeType();
        String currType = currValue.getType();
        String currURL = currValue.getUrl();
        InputStream currStream = currValue.getContents();
        String currContent;
        String myContentString;
        int myContentStringIndex;
        Date currDate = currValue.getDate();
        int currHTTPrc = currValue.getHttpReturnCode();
        int currLength = currValue.getLength();

        System.out.println("KEY " + start + ": " + currKey + " MIME Type: " + currMIMEType + " Type: "
                + currType + " URL: " + currURL + " Date: " + currDate.toString() + " HTTPrc: " + currHTTPrc
                + " Length: " + currLength);

        // check example record 1 (the first one, the header of the WARC file)
        if (start == 1) {
            // "myContentString" is an arbitrary string snippet that we know exists in the content
            // stream, at a known position. We search for it in the content we read and compare
            // the resulting index against the known value.
            currContent = content2String(currStream);
            myContentString = "isPartOf: basic";
            myContentStringIndex = currContent.indexOf(myContentString);
            //System.out.println("Search for: " + myContentString + " => Index is: " + myContentStringIndex);

            assertEquals("ID not equal", "<urn:uuid:18cfb53d-1c89-4cc6-863f-e5535d430c95>",
                    currKey.toString());
            assertEquals("MIME Type not equal", "application/warc-fields", currMIMEType);
            assertEquals("Response type not equal", "warcinfo", currType);
            assertEquals("URL not equal", null, currURL);
            assertTrue("Date not correct", currDate.toString().startsWith("Wed May 22 12:27:40"));
            assertEquals("HTTPrc not equal", -1, currHTTPrc);
            assertEquals("Record length not equal", 374, currLength);
            assertEquals("Content mismatch", 202, myContentStringIndex);
        }
        start++;
    }
}
From source file:gobblin.runtime.mapreduce.GobblinWorkUnitsInputFormatTest.java
License:Apache License
@Test
public void testRecordReader() throws Exception {
    List<String> paths = Lists.newArrayList("/path1", "/path2");
    GobblinWorkUnitsInputFormat.GobblinSplit split = new GobblinWorkUnitsInputFormat.GobblinSplit(paths);

    GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
    RecordReader<LongWritable, Text> recordReader = inputFormat.createRecordReader(split,
            new TaskAttemptContextImpl(new Configuration(), new TaskAttemptID("a", 1, TaskType.MAP, 1, 1)));

    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 0);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path1");
    recordReader.nextKeyValue();
    Assert.assertEquals(recordReader.getCurrentKey().get(), 1);
    Assert.assertEquals(recordReader.getCurrentValue().toString(), "/path2");
    Assert.assertFalse(recordReader.nextKeyValue());
}
From source file:info.halo9pan.word2vec.hadoop.mr.SortInputFormat.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 10 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 *
 * @param job
 *            the job to sample
 * @param partFile
 *            where to write the output file to
 * @throws Throwable
 *             if something goes wrong
 */
public static void writePartitionFile(final JobContext job, Path partFile) throws Throwable {
    long t1 = System.currentTimeMillis();
    Configuration conf = job.getConfiguration();
    final SortInputFormat inFormat = new SortInputFormat();
    final TextSampler sampler = new TextSampler();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
    final List<InputSplit> splits = inFormat.getSplits(job);
    long t2 = System.currentTimeMillis();
    System.out.println("Computing input splits took " + (t2 - t1) + "ms");
    int samples = Math.min(conf.getInt(NUM_PARTITIONS, 10), splits.size());
    System.out.println("Sampling " + samples + " splits of " + splits.size());
    final long recordsPerSample = sampleSize / samples;
    final int sampleStep = splits.size() / samples;
    Thread[] samplerReader = new Thread[samples];
    SamplerThreadGroup threadGroup = new SamplerThreadGroup("Sampler Reader Thread Group");
    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        final int idx = i;
        samplerReader[i] = new Thread(threadGroup, "Sampler Reader " + idx) {
            {
                setDaemon(true);
            }

            public void run() {
                long records = 0;
                try {
                    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(),
                            new TaskAttemptID());
                    RecordReader<Text, Text> reader = inFormat.createRecordReader(splits.get(sampleStep * idx),
                            context);
                    reader.initialize(splits.get(sampleStep * idx), context);
                    while (reader.nextKeyValue()) {
                        sampler.addKey(new Text(reader.getCurrentKey()));
                        records += 1;
                        if (recordsPerSample <= records) {
                            break;
                        }
                    }
                } catch (IOException ie) {
                    System.err.println(
                            "Got an exception while reading splits " + StringUtils.stringifyException(ie));
                    throw new RuntimeException(ie);
                } catch (InterruptedException e) {
                }
            }
        };
        samplerReader[i].start();
    }
    FileSystem outFs = partFile.getFileSystem(conf);
    DataOutputStream writer = outFs.create(partFile, true, 64 * 1024, (short) 10,
            outFs.getDefaultBlockSize(partFile));
    for (int i = 0; i < samples; i++) {
        try {
            samplerReader[i].join();
            if (threadGroup.getThrowable() != null) {
                throw threadGroup.getThrowable();
            }
        } catch (InterruptedException e) {
        }
    }
    for (Text split : sampler.createPartitions(partitions)) {
        split.write(writer);
    }
    writer.close();
    long t3 = System.currentTimeMillis();
    System.out.println("Computing partitions took " + (t3 - t2) + "ms");
}
From source file:io.druid.data.input.orc.DruidOrcInputFormatTest.java
License:Apache License
@Test
public void testRead() throws IOException, InterruptedException {
    InputFormat inputFormat = ReflectionUtils.newInstance(OrcNewInputFormat.class, job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);
    OrcHadoopInputRowParser parser = (OrcHadoopInputRowParser) config.getParser();

    reader.initialize(split, context);
    reader.nextKeyValue();

    OrcStruct data = (OrcStruct) reader.getCurrentValue();
    MapBasedInputRow row = (MapBasedInputRow) parser.parse(data);

    Assert.assertTrue(row.getEvent().keySet().size() == 4);
    Assert.assertEquals(new DateTime(timestamp), row.getTimestamp());
    Assert.assertEquals(parser.getParseSpec().getDimensionsSpec().getDimensionNames(), row.getDimensions());
    Assert.assertEquals(col1, row.getEvent().get("col1"));
    Assert.assertEquals(Arrays.asList(col2), row.getDimension("col2"));

    reader.close();
}
From source file:io.druid.data.input.parquet.DruidParquetInputFormatTest.java
License:Apache License
@Test
public void test() throws IOException, InterruptedException {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf);

    HadoopDruidIndexerConfig config = HadoopDruidIndexerConfig
            .fromFile(new File("example/wikipedia_hadoop_parquet_job.json"));
    config.intoConfiguration(job);

    File testFile = new File("example/wikipedia_list.parquet");
    Path path = new Path(testFile.getAbsoluteFile().toURI());
    FileSplit split = new FileSplit(path, 0, testFile.length(), null);

    InputFormat inputFormat = ReflectionUtils.newInstance(DruidParquetInputFormat.class,
            job.getConfiguration());
    TaskAttemptContext context = new TaskAttemptContextImpl(job.getConfiguration(), new TaskAttemptID());
    RecordReader reader = inputFormat.createRecordReader(split, context);

    reader.initialize(split, context);
    reader.nextKeyValue();

    GenericRecord data = (GenericRecord) reader.getCurrentValue();

    // field not read, should return null
    assertEquals(data.get("added"), null);
    assertEquals(data.get("page"), new Utf8("Gypsy Danger"));

    reader.close();
}
From source file:io.ssc.trackthetrackers.extraction.hadoop.util.Compaction.java
License:Open Source License
public static void main(String[] args) throws IOException, InterruptedException {
    if (args.length != 2) {
        System.out.println("Usage: <input folder> <output file>");
        System.exit(-1);
    }

    String inputPath = args[0];
    String outputFile = args[1];

    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);

    FileStatus[] input = fs.listStatus(new Path(inputPath), new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.toString().endsWith(".parquet");
        }
    });

    Path output = new Path(outputFile);
    fs.delete(output, true);

    ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder> inputFormat = new ProtoParquetInputFormat<ParsedPageProtos.ParsedPageOrBuilder>();
    inputFormat.setReadSupportClass(new JobConf(conf), ProtoReadSupport.class);

    Job job = new Job(conf);
    ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage> outputFormat = new ProtoParquetOutputFormat<ParsedPageProtos.ParsedPage>(
            ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setProtobufClass(job, ParsedPageProtos.ParsedPage.class);
    ProtoParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ProtoParquetOutputFormat.setEnableDictionary(job, true);

    RecordWriter<Void, ParsedPageProtos.ParsedPage> recordWriter = outputFormat.getRecordWriter(conf, output,
            CompressionCodecName.SNAPPY);

    List<ParquetInputSplit> splits = new ArrayList<ParquetInputSplit>();
    for (FileStatus fileStatus : input) {
        System.out.println(fileStatus.getPath().toString());
        splits.addAll(inputFormat.getSplits(conf, ParquetFileReader.readFooters(conf, fileStatus)));
    }

    int splitIndex = 0;
    for (ParquetInputSplit split : splits) {
        System.out.println("Processing split: " + split.getPath().toString() + "(" + splitIndex + " of "
                + splits.size() + ")");
        TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", splitIndex, true, splitIndex),
                splitIndex);
        TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);

        RecordReader<Void, ParsedPageProtos.ParsedPageOrBuilder> reader = inputFormat.createRecordReader(split,
                ctx);
        reader.initialize(split, ctx);

        while (reader.nextKeyValue()) {
            ParsedPageProtos.ParsedPageOrBuilder record = reader.getCurrentValue();

            ParsedPageProtos.ParsedPage.Builder builder = ParsedPageProtos.ParsedPage.newBuilder();
            builder.setUrl(record.getUrl());
            builder.setArchiveTime(record.getArchiveTime());
            builder.addAllScripts(record.getScriptsList());
            builder.addAllIframes(record.getIframesList());
            builder.addAllLinks(record.getLinksList());
            builder.addAllImages(record.getImagesList());

            recordWriter.write(null, builder.build());
        }

        if (reader != null) {
            reader.close();
        }
        splitIndex++;
    }

    TaskAttemptID taskAttemptID = new TaskAttemptID(new TaskID("identifier", 1, true, 1), 1);
    TaskAttemptContext ctx = new org.apache.hadoop.mapreduce.TaskAttemptContext(conf, taskAttemptID);
    if (recordWriter != null) {
        recordWriter.close(ctx);
    }
}
From source file:it.crs4.pydoop.mapreduce.pipes.PipesMapper.java
License:Apache License
@Override
public void run(Context context) throws IOException, InterruptedException {
    setup(context);
    Configuration conf = context.getConfiguration();
    InputSplit split = context.getInputSplit();
    // FIXME: do we really need to be so convoluted?
    InputFormat<K1, V1> inputFormat;
    try {
        inputFormat = (InputFormat<K1, V1>) ReflectionUtils.newInstance(context.getInputFormatClass(), conf);
    } catch (ClassNotFoundException ce) {
        throw new RuntimeException("class not found", ce);
    }
    RecordReader<K1, V1> input = inputFormat.createRecordReader(split, context);
    input.initialize(split, context);
    boolean isJavaInput = Submitter.getIsJavaRecordReader(conf);
    try {
        // FIXME: what happens for a java mapper and no java record reader?
        DummyRecordReader fakeInput = (!isJavaInput && !Submitter.getIsJavaMapper(conf))
                ? (DummyRecordReader) input
                : null;
        application = new Application<K1, V1, K2, V2>(context, fakeInput);
    } catch (InterruptedException ie) {
        throw new RuntimeException("interrupted", ie);
    }
    DownwardProtocol<K1, V1> downlink = application.getDownlink();
    // FIXME: InputSplit is not Writable, but still, this is ugly...
    downlink.runMap((FileSplit) context.getInputSplit(), context.getNumReduceTasks(), isJavaInput);
    boolean skipping = conf.getBoolean(context.SKIP_RECORDS, false);
    boolean sent_input_types = false;
    try {
        if (isJavaInput) {
            // FIXME
            while (input.nextKeyValue()) {
                if (!sent_input_types) {
                    sent_input_types = true;
                    NullWritable n = NullWritable.get();
                    String kclass_name = n.getClass().getName();
                    String vclass_name = n.getClass().getName();
                    if (input.getCurrentKey() != null) {
                        kclass_name = input.getCurrentKey().getClass().getName();
                    }
                    if (input.getCurrentValue() != null) {
                        vclass_name = input.getCurrentValue().getClass().getName();
                    }
                    downlink.setInputTypes(kclass_name, vclass_name);
                }
                downlink.mapItem(input.getCurrentKey(), input.getCurrentValue());
                if (skipping) {
                    // flush the streams on every record input if running in skip mode
                    // so that we don't buffer other records surrounding a bad record.
                    downlink.flush();
                }
            }
            downlink.endOfInput();
        }
        application.waitForFinish();
    } catch (Throwable t) {
        application.abort(t);
    } finally {
        cleanup(context);
    }
}
From source file:it.crs4.seal.tsv_sort.TextSampler.java
License:Apache License
/**
 * Use the input splits to take samples of the input and generate sample
 * keys. By default reads 100,000 keys from 20 locations in the input, sorts
 * them and picks N-1 keys to generate N equally sized partitions.
 * @param inFormat The input to sample
 * @param job the job to sample
 * @param partFile where to write the output file to
 * @throws IOException if something goes wrong
 */
public static void writePartitionFile(FileInputFormat<Text, Text> inFormat, JobContext job, Path partFile)
        throws IOException, InterruptedException {
    Configuration conf = job.getConfiguration();
    TaskAttemptContext taskContext = Utils.getTaskAttemptContext(conf);

    TextSampler sampler = new TextSampler();
    Text key = new Text();
    Text value = new Text();
    int partitions = job.getNumReduceTasks();
    long sampleSize = conf.getLong(SAMPLE_SIZE_CONF, SAMPLE_SIZE_DEFAULT);
    List<InputSplit> splits = inFormat.getSplits(job);
    int samples = Math.min(MAX_SLICES_SAMPLED, splits.size());
    long recordsPerSample = sampleSize / samples;
    int sampleStep = splits.size() / samples;
    long records = 0;

    // take N samples from different parts of the input
    for (int i = 0; i < samples; ++i) {
        InputSplit isplit = splits.get(sampleStep * i);
        RecordReader<Text, Text> reader = inFormat.createRecordReader(isplit, taskContext);
        reader.initialize(isplit, taskContext);
        while (reader.nextKeyValue()) {
            sampler.addKey(reader.getCurrentKey());
            records += 1;
            if ((i + 1) * recordsPerSample <= records) {
                break;
            }
        }
    }

    FileSystem outFs = partFile.getFileSystem(conf);
    if (outFs.exists(partFile))
        outFs.delete(partFile, false);

    SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class,
            NullWritable.class);
    NullWritable nullValue = NullWritable.get();
    for (Text split : sampler.createPartitions(partitions)) {
        writer.append(split, nullValue);
    }
    writer.close();
}