List of usage examples for the org.apache.hadoop.mapred FileSplit constructor:
public FileSplit(Path file, long start, long length, String[] hosts)
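For orientation, here is a minimal sketch of constructing a FileSplit by hand, assuming an existing file on HDFS (the path is hypothetical); the hosts array is taken from the file's block locations so the scheduler can place the task near its data:

Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
Path file = new Path("/data/input.txt"); // hypothetical input file
FileStatus status = fs.getFileStatus(file);

// One split covering the first block, located on that block's hosts.
BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());
FileSplit split = new FileSplit(file, 0, blocks[0].getLength(), blocks[0].getHosts());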
From source file: PageInputFormat.java
License: Apache License

protected FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
    return new FileSplit(file, start, length, hosts);
}
From source file: RunText.java
License: Apache License

public static void main(String[] args) throws Exception {
    o = new Options();
    JCommander jc = null;
    try {
        jc = new JCommander(o, args);
        jc.setProgramName("./runText");
    } catch (ParameterException e) {
        System.out.println(e.getMessage());
        String[] valid = { "-p", "path", "-d", "delimiter", "v", "value", "-i", "index" };
        new JCommander(o, valid).usage();
        System.exit(-1);
    }
    if (o.help) {
        jc.usage();
        System.exit(0);
    }
    path = new Path(o.path);
    delim = o.delimiter.getBytes()[0];
    toFind = o.value;
    index = o.index;
    numThreads = o.threads;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    TextInputFormat format = new TextInputFormat();
    long len = fs.getFileStatus(path).getLen() / numThreads;
    List<Thread> threads = Lists.newArrayList();
    for (int i = 0; i < numThreads; i++) {
        FileSplit split = new FileSplit(path, i * len, len, new String[] { "" });
        threads.add(new Thread(new RunText(split, format)));
    }
    runningThreads = new AtomicInteger(numThreads);
    for (Thread t : threads) {
        t.start();
    }
    int prev = 0;
    int current;
    long t1 = System.nanoTime();
    long t2;
    while (runningThreads.get() > 0) {
        Thread.sleep(5000);
        current = totalCount.get();
        t2 = System.nanoTime();
        System.out.println(String.format("%f records/sec", (current - prev) * 1e9 / (t2 - t1)));
        t1 = t2;
        prev = current;
    }
    for (Thread t : threads) {
        t.join();
    }
    fs.close();
}
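One caveat in the snippet above: getLen() / numThreads rounds down, so up to numThreads - 1 trailing bytes of the file are covered by no split. A minimal fix, sketched under the same variable names, is to stretch only the last split to the end of the file:

// Sketch: the last split absorbs the remainder left by integer division.
long fileLen = fs.getFileStatus(path).getLen();
long len = fileLen / numThreads;
for (int i = 0; i < numThreads; i++) {
    long splitLen = (i == numThreads - 1) ? fileLen - i * len : len;
    FileSplit split = new FileSplit(path, i * len, splitLen, new String[] { "" });
    threads.add(new Thread(new RunText(split, format)));
}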
From source file: StreamWikiDumpInputFormat.java
License: Apache License

public FileSplit makeSplit(Path file, long start, long length, String[] hosts) {
    return new FileSplit(file, start, length, hosts);
}
From source file: ca.sparkera.adapters.mapred.MainframeVBInputFormat.java
License: Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 */
@Override
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    FileStatus[] files = listStatus(job);
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }
    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        FSDataInputStream fileIn;
        InputStream inputStream;
        fileIn = fs.open(path);
        inputStream = fileIn;
        filePosition = fileIn;
        long offset = 0;
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long bytesRemaining = length;
            long splitSize = 0;
            while (offset < length) {
                splitSize = computeSplitSize(goalSize, minSize, blockSize, inputStream);
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
                offset = length - bytesRemaining;
            }
            if (bytesRemaining != 0) {
                throw new IOException("Partial record(length = " + bytesRemaining
                        + ") found at the end of file " + path);
            }
        } else if (length != 0) {
            splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
        if (inputStream != null) {
            inputStream.close();
            inputStream = null;
        }
    }
    java.util.Date date = new java.util.Date();
    System.out.println((new Timestamp(date.getTime())) + ",\t Split = 100% Total Splits - " + (++splitCount)
            + "\t Total Records in VB file - " + totalRecords);
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
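Note that the inputStream argument to computeSplitSize is not part of the stock FileInputFormat signature; presumably the VB-aware override reads the stream so that each split ends on a complete-record boundary, which is why a nonzero bytesRemaining after the loop is reported as a partial record. A hypothetical illustration of that idea, assuming IBM-style 4-byte record descriptor words (RDWs) whose first two bytes hold the record length including the RDW itself (not the actual implementation):

// Hypothetical: trim a tentative split size back to the last whole-record boundary.
static long alignToRecordBoundary(FSDataInputStream in, long start, long tentativeSize)
        throws IOException {
    long pos = 0;
    while (true) {
        in.seek(start + pos);
        int recLen = in.readUnsignedShort(); // RDW bytes 0-1: record length, RDW included
        if (recLen <= 0 || pos + recLen > tentativeSize) {
            break;
        }
        pos += recLen; // advance to the next record's RDW
    }
    return pos; // covers complete records only
}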
From source file: cascading.tap.hadoop.io.CombineFileRecordReaderWrapper.java
License: Open Source License

public CombineFileRecordReaderWrapper(CombineFileSplit split, Configuration conf, Reporter reporter,
        Integer idx) throws Exception {
    FileSplit fileSplit = new FileSplit(split.getPath(idx), split.getOffset(idx), split.getLength(idx),
            split.getLocations());

    Class<?> clz = conf.getClass(INDIVIDUAL_INPUT_FORMAT, null);
    FileInputFormat<K, V> inputFormat = (FileInputFormat<K, V>) clz.newInstance();

    if (inputFormat instanceof Configurable)
        ((Configurable) inputFormat).setConf(conf);

    delegate = inputFormat.getRecordReader(fileSplit, (JobConf) conf, reporter);
}
From source file: com.alexholmes.hadooputils.combine.avro.mapred.CombineAvroInputFormatTest.java
License: Apache License

@SuppressWarnings("deprecation")
public void testProjection() throws Exception {
    JobConf job = new JobConf();
    Integer defaultRank = new Integer(-1);
    String jsonSchema = "{\"type\":\"record\"," + "\"name\":\"org.apache.avro.mapred.Pair\","
            + "\"fields\": [ " + "{\"name\":\"rank\", \"type\":\"int\", \"default\": -1},"
            + "{\"name\":\"value\", \"type\":\"long\"}" + "]}";
    Schema readerSchema = Schema.parse(jsonSchema);
    AvroJob.setInputSchema(job, readerSchema);
    String dir = System.getProperty("test.dir", ".") + "/mapred";
    Path inputPath = new Path(dir + "/out" + "/part-00000" + AvroOutputFormat.EXT);
    FileStatus fileStatus = FileSystem.get(job).getFileStatus(inputPath);
    FileSplit fileSplit = new FileSplit(inputPath, 0, fileStatus.getLen(), job);

    AvroRecordReader<Pair<Integer, Long>> recordReader =
            new AvroRecordReader<Pair<Integer, Long>>(job, fileSplit);
    AvroWrapper<Pair<Integer, Long>> inputPair = new AvroWrapper<Pair<Integer, Long>>(null);
    NullWritable ignore = NullWritable.get();

    long sumOfCounts = 0;
    long numOfCounts = 0;
    while (recordReader.next(inputPair, ignore)) {
        Assert.assertEquals((Integer) inputPair.datum().get(0), defaultRank);
        sumOfCounts += (Long) inputPair.datum().get(1);
        numOfCounts++;
    }
    Assert.assertEquals(numOfCounts, WordCountUtil.COUNTS.size());

    long actualSumOfCounts = 0;
    for (Long count : WordCountUtil.COUNTS.values()) {
        actualSumOfCounts += count;
    }
    Assert.assertEquals(sumOfCounts, actualSumOfCounts);
}
From source file: com.alexholmes.hadooputils.combine.common.mapred.CommonCombineRecordReader.java
License: Apache License

/**
 * Moves on to the next split inside {@link #split}. The {@link #reader} will be {@code null}
 * once we have exhausted all the splits.
 *
 * @return true if we successfully moved on to the next split
 * @throws java.io.IOException if we hit io errors
 */
public boolean nextReader() throws IOException {
    // close the current reader and set it to null
    close();

    currentSplit++;

    if (currentSplit >= split.getPaths().length) {
        // hit the end of the line
        return false;
    }

    FileSplit fileSplit = new FileSplit(split.getPath(currentSplit), split.getOffset(currentSplit),
            split.getLength(currentSplit),
            split.getLocations() == null || split.getLocations().length - 1 < currentSplit ? null
                    : new String[] { split.getLocations()[currentSplit] });

    reader = engineerer.createRecordReader(conf, fileSplit);
    return true;
}
From source file: com.aliyun.fs.utils.OssInputUtils.java
License: Apache License

public FileSplit[] getSplits(String file, int numSplits) throws IOException {
    Path path = new Path(file);
    this.fs = FileSystem.get(path.toUri(), conf);
    fs.initialize(path.toUri(), conf);

    FileStatus[] files = fs.listStatus(path);
    long totalSize = 0;
    for (FileStatus file1 : files) {
        if (file1.isDirectory()) {
            throw new IOException("Not a file: " + file1.getPath());
        }
        totalSize += file1.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(
            conf.getLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE, 1), 1);

    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    for (FileStatus file2 : files) {
        Path fp = file2.getPath();
        long length = file2.getLen();
        if (length != 0) {
            long splitSize = Math.max(minSize, goalSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                FileSplit split = new FileSplit(fp, length - bytesRemaining, splitSize, new String[0]);
                splits.add(split);
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                FileSplit split = new FileSplit(fp, length - bytesRemaining, bytesRemaining, new String[0]);
                splits.add(split);
            }
        }
    }
    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
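The splits returned above carry empty host arrays, which is reasonable for an object store like OSS where there is no block locality to exploit. A hedged sketch of consuming them, assuming the files hold text and the OSS FileSystem is configured in conf:

// Sketch: read each split line by line with the mapred LineRecordReader.
for (FileSplit split : splits) {
    LineRecordReader reader = new LineRecordReader(conf, split);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
        // process one line of the split
    }
    reader.close();
}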
From source file: com.bianfeng.bfas.hive.io.RealtimeInputFormat2.java
License: Apache License

/**
 * Splits files returned by {@link #listStatus(JobConf)} when they're too big.
 */
@SuppressWarnings("deprecation")
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    LOG.warn("test go go go");

    FileStatus[] files = listStatus(job);

    // Save the number of input files in the job-conf
    job.setLong(NUM_INPUT_FILES, files.length);

    long totalSize = 0; // compute total size
    for (FileStatus file : files) {
        // check we have valid files
        if (file.isDir()) {
            throw new IOException("Not a file: " + file.getPath());
        }
        totalSize += file.getLen();
    }

    long goalSize = totalSize / (numSplits == 0 ? 1 : numSplits);
    long minSize = Math.max(job.getLong("mapred.min.split.size", 1), minSplitSize);

    // generate splits
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>(numSplits);
    NetworkTopology clusterMap = new NetworkTopology();
    for (FileStatus file : files) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(fs, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(goalSize, minSize, blockSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                String[] splitHosts = getSplitHosts(blkLocations, length - bytesRemaining, splitSize,
                        clusterMap);
                splits.add(new FileSplit(path, length - bytesRemaining, splitSize, splitHosts));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new FileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            String[] splitHosts = getSplitHosts(blkLocations, 0, length, clusterMap);
            splits.add(new FileSplit(path, 0, length, splitHosts));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new FileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());
    return splits.toArray(new FileSplit[splits.size()]);
}
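For reference, computeSplitSize here follows the stock org.apache.hadoop.mapred.FileInputFormat logic: the split size is the block size, clamped from above by the per-file goal size and from below by the configured minimum:

protected long computeSplitSize(long goalSize, long minSize, long blockSize) {
    return Math.max(minSize, Math.min(goalSize, blockSize));
}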
From source file: com.conductor.s3.S3InputFormatUtils.java
License: Apache License

/**
 * Converts the {@link org.apache.hadoop.fs.FileStatus}s to
 * {@link org.apache.hadoop.mapred.InputSplit}s (MRV1 API).
 * <p>
 * This is taken directly from {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}, less any
 * file system operations that do not make sense when using {@code S3}.
 *
 * @param files
 *            the files to convert
 * @param minSize
 *            the minimum size of the splits
 * @param maxSize
 *            the maximum size of the splits
 * @return the splits.
 */
static List<InputSplit> convertToInputSplitsMRV1(final Iterable<FileStatus> files, final long minSize,
        final long maxSize) {
    final List<InputSplit> splits = Lists.newArrayList();
    for (final FileStatus file : files) {
        // check for valid data for this input format
        checkArgument(!file.isDirectory(), "Cannot pass directories to this method!");
        final String path = file.getPath().toString();
        checkArgument(path.startsWith("s3:") || path.startsWith("s3n:"), "Expected S3 input");

        // create splits out of file
        final long length = file.getLen();
        if (length > 0) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);
            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, splitSize, S3_SPLIT_HOST));
                bytesRemaining -= splitSize;
            }
            if (bytesRemaining != 0) {
                splits.add(new FileSplit(file.getPath(), length - bytesRemaining, bytesRemaining,
                        S3_SPLIT_HOST));
            }
        }
    }
    return splits;
}
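A note on the SPLIT_SLOP guard used in several of the loops above: in Hadoop's FileInputFormat it is 1.1, so a trailing chunk up to 10% larger than splitSize is emitted as one split rather than spawning a tiny remainder. For example, with splitSize = 128 MB and 140 MB remaining, 140 / 128 ≈ 1.09 ≤ 1.1, so the loop exits and a single 140 MB split is created.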