Usage examples for org.apache.hadoop.mapred.FileSplit#getStart()
public long getStart()
Returns the position of the first byte in the file to process.
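Every example below follows the same basic pattern: a FileSplit describes the byte range [getStart(), getStart() + getLength()) of one file, and a reader opens that file and seeks to the range. A minimal sketch of the pattern, with a hypothetical SplitOpener helper that is not part of any listing below:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;

// Hypothetical helper: opens a split's file and positions the stream
// at the split's first byte.
public class SplitOpener {
    public static FSDataInputStream openAtSplitStart(Configuration conf, FileSplit split)
            throws IOException {
        Path path = split.getPath();
        long start = split.getStart(); // offset of the split's first byte within the file
        FileSystem fs = path.getFileSystem(conf);
        FSDataInputStream in = fs.open(path);
        in.seek(start); // readers then stop at getStart() + getLength()
        return in;
    }
}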
From source file:DeprecatedBAMInputFormat.java
License:Open Source License
public static List<org.apache.hadoop.mapreduce.InputSplit> undeprecateSplits(InputSplit[] splits)
        throws IOException {
    final List<org.apache.hadoop.mapreduce.InputSplit> undeprecated =
            new ArrayList<org.apache.hadoop.mapreduce.InputSplit>(splits.length);
    for (final InputSplit s : splits) {
        final FileSplit f = (FileSplit) s;
        // Copy the old-API split's path, start offset, length, and hosts
        // into a new-API FileSplit.
        undeprecated.add(new org.apache.hadoop.mapreduce.lib.input.FileSplit(
                f.getPath(), f.getStart(), f.getLength(), f.getLocations()));
    }
    return undeprecated;
}
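A usage sketch: since undeprecateSplits casts every element to FileSplit, it only works for input formats whose old-API getSplits produces plain FileSplits. The oldFormat name below is hypothetical.

// Hypothetical usage; `oldFormat` stands in for any mapred FileInputFormat
// whose splits are plain FileSplits.
JobConf job = new JobConf();
InputSplit[] oldSplits = oldFormat.getSplits(job, /* numSplits hint */ 4);
List<org.apache.hadoop.mapreduce.InputSplit> newSplits =
        DeprecatedBAMInputFormat.undeprecateSplits(oldSplits);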
From source file:StreamWikiDumpInputFormat.java
License:Apache License
public List<InputSplit> getSplits(JobConf job, FileStatus file, String pattern, long splitSize)
        throws IOException {
    NetworkTopology clusterMap = new NetworkTopology();
    List<InputSplit> splits = new ArrayList<InputSplit>();
    Path path = file.getPath();
    long length = file.getLen();
    FileSystem fs = file.getPath().getFileSystem(job);
    BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
    if ((length != 0) && isSplitable(fs, path)) {
        long bytesRemaining = length;
        SeekableInputStream in =
                SeekableInputStream.getInstance(path, 0, length, fs, this.compressionCodecs);
        InputStream is = null;
        long start = 0;
        long skip = 0;
        // Note: `is` is never assigned, so this block is effectively disabled.
        if (is != null) {
            // start = is.getAdjustedStart();
            // length = is.getAdjustedEnd();
            is.close();
            in = null;
        }
        LOG.info("locations=" + Arrays.asList(blkLocations));
        FileSplit split = null;
        Set<Long> processedPageEnds = new HashSet<Long>();
        float factor = job.getFloat(KEY_SKIP_FACTOR, 1.2F);

        READLOOP: while (((double) bytesRemaining) / splitSize > factor && bytesRemaining > 0) {
            // Prepare a matcher over a look-ahead split.
            ByteMatcher matcher;
            {
                long st = Math.min(start + skip + splitSize, length - 1);
                split = makeSplit(path, st, Math.min(splitSize, length - st), clusterMap, blkLocations);
                System.err.println("split move to: " + split);
                if (in != null)
                    in.close();
                if (split.getLength() <= 1) {
                    break;
                }
                in = SeekableInputStream.getInstance(split, fs, this.compressionCodecs);
                // SplitCompressionInputStream cin = in.getSplitCompressionInputStream();
            }
            matcher = new ByteMatcher(in);

            // Read until the next page end within the look-ahead split,
            // extending the split as long as no page end is found.
            boolean reach = false;
            while (!matcher.readUntilMatch(pageEndPattern, null, split.getStart() + split.getLength())) {
                if (matcher.getPos() >= length || split.getLength() == length - split.getStart())
                    break READLOOP;
                reach = false;
                split = makeSplit(path, split.getStart(),
                        Math.min(split.getLength() + splitSize, length - split.getStart()),
                        clusterMap, blkLocations);
                System.err.println("split extend to: " + split);
            }
            System.err.println(path + ": #" + splits.size() + " " + pageEndPattern + " found: pos="
                    + matcher.getPos() + " last=" + matcher.getLastUnmatchPos() + " read="
                    + matcher.getReadBytes() + " current=" + start + " remaining=" + bytesRemaining
                    + " split=" + split);
            if (matcher.getLastUnmatchPos() > 0 && matcher.getPos() > matcher.getLastUnmatchPos()
                    && !processedPageEnds.contains(matcher.getPos())) {
                splits.add(makeSplit(path, start, matcher.getPos() - start, clusterMap, blkLocations));
                processedPageEnds.add(matcher.getPos());
                long newstart = Math.max(matcher.getLastUnmatchPos(), start);
                bytesRemaining = length - newstart;
                start = newstart;
                skip = 0;
            } else {
                skip = matcher.getPos() - start;
            }
        }

        if (bytesRemaining > 0 && !processedPageEnds.contains(length)) {
            System.err.println(pageEndPattern + " remaining: pos=" + (length - bytesRemaining)
                    + " end=" + length);
            splits.add(makeSplit(path, length - bytesRemaining, bytesRemaining,
                    blkLocations[blkLocations.length - 1].getHosts()));
        }
        if (in != null)
            in.close();
    } else if (length != 0) {
        splits.add(makeSplit(path, 0, length, clusterMap, blkLocations));
    } else {
        // Create an empty hosts array for zero-length files.
        splits.add(makeSplit(path, 0, length, new String[0]));
    }
    return splits;
}
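The makeSplit overloads are not part of this listing. A plausible reconstruction of the five-argument form, modeled on FileInputFormat's host selection; getBlockIndex is FileInputFormat's own protected helper, but everything else here is an assumption, not the project's actual code:

// Hypothetical reconstruction: build a FileSplit for [start, start + length)
// and give it the hosts of the block containing the range's midpoint.
// `clusterMap` is kept only to match the call sites above.
private FileSplit makeSplit(Path path, long start, long length,
        NetworkTopology clusterMap, BlockLocation[] blkLocations) throws IOException {
    int blkIndex = getBlockIndex(blkLocations, start + length / 2);
    return new FileSplit(path, start, length, blkLocations[blkIndex].getHosts());
}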
From source file:StreamWikiDumpInputFormat.java
License:Apache License
private static List<Long> getPageBytes(FileSplit split, FileSystem fs,
        CompressionCodecFactory compressionCodecs, Reporter reporter) throws IOException {
    SeekableInputStream in = null;
    try {
        in = SeekableInputStream.getInstance(split, fs, compressionCodecs);
        long start = split.getStart();
        long end = start + split.getLength();
        InputStream cin = null;
        // Note: `cin` is never assigned, so this block is effectively disabled.
        if (cin != null) {
            // start = cin.getAdjustedStart();
            // end = cin.getAdjustedEnd() + 1;
        }
        ByteMatcher matcher = new ByteMatcher(in, in);
        List<Long> ret = new ArrayList<Long>();
        while (true) {
            if (matcher.getPos() >= end || !matcher.readUntilMatch(pageBeginPattern, null, end)) {
                break;
            }
            ret.add(matcher.getReadBytes() - pageBeginPattern.getBytes("UTF-8").length);
            if (matcher.getPos() >= end || !matcher.readUntilMatch(pageEndPattern, null, end)) {
                System.err.println("could not find " + pageEndPattern + ", page over a split? pos="
                        + matcher.getPos() + " bytes=" + matcher.getReadBytes());
                // ret.add(end);
                break;
            }
            ret.add(matcher.getReadBytes() - pageEndPattern.getBytes("UTF-8").length);
            String report = String.format(
                    "StreamWikiDumpInputFormat: find page %6d start=%d pos=%d end=%d bytes=%d",
                    ret.size(), start, matcher.getPos(), end, matcher.getReadBytes());
            reporter.setStatus(report);
            reporter.incrCounter(WikiDumpCounters.FOUND_PAGES, 1);
            LOG.info(report);
        }
        if (ret.size() % 2 == 0) {
            ret.add(matcher.getReadBytes());
        }
        // System.err.println("getPageBytes " + ret); //!
        return ret;
    } finally {
        if (in != null) {
            in.close();
        }
    }
}
From source file:DeprecatedBAMRecordReader.java
License:Open Source License
public DeprecatedBAMRecordReader(InputSplit split, final JobConf job, Reporter reporter)
        throws IOException {
    if (split instanceof DeprecatedFileVirtualSplit) {
        rr.initialize(((DeprecatedFileVirtualSplit) split).vs, new FakeTaskAttemptContext(job));
        splitLength = split.getLength();
        return;
    }
    if (split instanceof FileSplit) {
        // Hive gives us its own custom FileSplits for some reason, so we have
        // to do our own split alignment. (Sometimes, anyway; for "select
        // count(*) from table" we get FileSplits here, but for "select * from
        // table" our input format is used directly. Perhaps it's only because
        // the latter doesn't spawn a MapReduce job, so getting a FileSplit
        // here is the common case.)
        //
        // Since we get only one split at a time here, this is very poor: we
        // have to open the file for every split, even if it's the same file
        // every time.
        //
        // This should always work, but might be /very/ slow. I can't think of
        // a better way.
        final FileSplit fspl = (FileSplit) split;
        final Path path = fspl.getPath();

        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();

        final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(job), path);
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin);

        final long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
        sin.close();

        if (alignedBeg == end)
            throw new IOException("Guesser found nothing after pos " + beg);

        final long alignedEnd = end << 16 | 0xffff;
        splitLength = (alignedEnd - alignedBeg) >> 16;

        rr.initialize(new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()),
                new FakeTaskAttemptContext(job));
        return;
    }
    throw new ClassCastException("Can only handle DeprecatedFileVirtualSplit and FileSplit");
}
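The shift arithmetic at the end relies on BGZF virtual file offsets, where a single long packs a compressed block's file offset (upper 48 bits) with a position inside the uncompressed block (lower 16 bits). An illustrative sketch of that packing, under those assumptions; this is not Hadoop-BAM API:

// Illustrative only: pack a compressed-file offset and an intra-block
// offset into one BGZF virtual offset, as the shifts above do implicitly.
final class VirtualOffsetSketch {
    static long makeVirtualOffset(long compressedBlockOffset, int withinBlockOffset) {
        return (compressedBlockOffset << 16) | (withinBlockOffset & 0xffff);
    }
    // So `end << 16 | 0xffff` is makeVirtualOffset(end, 0xffff): the last
    // addressable position in the block starting at byte `end`, which keeps
    // the final record of the range inside the split.
}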
From source file:SeekableInputStream.java
License:Apache License
public static SeekableInputStream getInstance(FileSplit split, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    // A split covers the byte range [getStart(), getStart() + getLength()).
    return getInstance(split.getPath(), split.getStart(), split.getStart() + split.getLength(),
            fs, compressionCodecs);
}
From source file:alluxio.hadoop.HadoopUtils.java
License:Apache License
/**
 * Returns a string representation of a Hadoop {@link FileSplit}.
 *
 * @param fs Hadoop {@link FileSplit}
 * @return its string representation
 */
public static String toStringHadoopFileSplit(FileSplit fs) {
    StringBuilder sb = new StringBuilder();
    sb.append("HadoopFileSplit: Path: ").append(fs.getPath());
    sb.append(" , Start: ").append(fs.getStart());
    sb.append(" , Length: ").append(fs.getLength());
    sb.append(" , Hosts: ");
    String[] locs;
    try {
        locs = fs.getLocations();
    } catch (IOException e) {
        LOG.error(e.getMessage());
        locs = new String[] {};
    }
    for (String loc : locs) {
        sb.append(loc).append("; ");
    }
    return sb.toString();
}
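A usage sketch with invented path and hosts; given the code above, the output should look roughly like the trailing comment:

// Hypothetical inputs for illustration.
FileSplit split = new FileSplit(new Path("/data/part-00000"), 0L, 134217728L,
        new String[] { "host1", "host2" });
System.out.println(HadoopUtils.toStringHadoopFileSplit(split));
// HadoopFileSplit: Path: /data/part-00000 , Start: 0 , Length: 134217728 , Hosts: host1; host2;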
From source file:ca.sparkera.adapters.mapred.MainframeVBRecordReader.java
License:Apache License
public MainframeVBRecordReader(Configuration job, FileSplit split) throws IOException {
    // Delegate to the new-API reader, unpacking the split into its
    // start offset, length, and path.
    reader = new ca.sparkera.adapters.mapreduce.MainframeVBRecordReader();
    reader.initialize(job, split.getStart(), split.getLength(), split.getPath());
}
From source file:com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java
License:Apache License
protected void initialize(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split.
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (codec != null) {
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job,
                    (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(fileIn, job);
        }
    }
    if (skipFirstLine) {
        // Skip the first line and re-establish "start".
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
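The --start / readLine dance for uncompressed input is the standard record-boundary idiom: a split rarely begins exactly on a line boundary, so every reader except the one at offset 0 discards its first (possibly partial) line, and each reader compensates by reading one line past its own end. A minimal self-contained sketch of just that step, assuming uncompressed newline-delimited input; names are illustrative:

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.LineReader;

// Sketch of the boundary idiom above, not library code.
final class BoundarySketch {
    // Returns the adjusted start: the first byte of the first complete line
    // owned by this split.
    static long skipPartialFirstLine(FSDataInputStream fileIn, long start) throws IOException {
        if (start == 0) {
            return 0; // the first split owns its first line in full
        }
        fileIn.seek(start - 1); // back up one byte so a boundary that falls right after '\n' still works
        LineReader reader = new LineReader(fileIn);
        return (start - 1) + reader.readLine(new Text()); // consume up to and including the next '\n'
    }
}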
From source file:com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java
License:Apache License
@Override
protected void initialize(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec found for file " + file + ", cannot run");
    }

    // Open the file and seek to the start of the split; creating the
    // input stream also reads the file header.
    fileIn = fs.open(split.getPath());
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (rowDelim != null) {
        byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
        in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
    } else {
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
    }
    if (start != 0) {
        fileIn.seek(start);
        // Read and ignore the first line.
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}
From source file:com.aliyun.fs.oss.common.OssRecordReader.java
License:Apache License
public OssRecordReader(Configuration job, FileSplit split, FileSystem fs, byte[] recordDelimiter)
        throws IOException {
    this.maxLineLength = job.getInt(
            org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split.
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiter);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from the compressed stream
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiter);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiter);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record,
    // because we always (except for the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
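For splittable codecs, the adjusted start/end and the filePosition stream matter later, when the reader reports how far it has gotten: compressed-stream position is the only meaningful measure of progress through a block-compressed split. A sketch of the progress computation that typically accompanies this pattern, mirroring Hadoop's own LineRecordReader; the field names follow the constructor above, but the methods themselves are assumptions, not OssRecordReader's actual code:

// Sketch only. Reports position from the compressed stream when one exists,
// then scales it into the split's [start, end) range.
private long getFilePosition() throws IOException {
    if (isCompressedInput() && null != filePosition) {
        return filePosition.getPos(); // compressed-stream position for block codecs
    }
    return pos; // raw byte position otherwise
}

public float getProgress() throws IOException {
    if (start == end) {
        return 0.0f;
    }
    return Math.min(1.0f, (getFilePosition() - start) / (float) (end - start));
}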