List of usage examples for org.apache.hadoop.mapred.FileSplit.getPath()
public Path getPath()
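Across the examples below, the recurring pattern is the same: call getPath() on the split, resolve the owning FileSystem from the returned Path, open the file, and seek to the split's start offset. As a minimal, self-contained sketch of that pattern (class and method names here are hypothetical, for illustration only):

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;

public class FileSplitOpenExample {

    /** Opens the file backing a FileSplit and positions the stream at the split's first byte. */
    public static FSDataInputStream openAtSplitStart(FileSplit split, JobConf job) throws IOException {
        final Path file = split.getPath();              // the file this split covers a range of
        final FileSystem fs = file.getFileSystem(job);  // resolve the FileSystem that owns the path
        final FSDataInputStream in = fs.open(file);
        in.seek(split.getStart());                      // jump to the split's start offset
        return in;
    }
}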
From source file: DeprecatedBAMInputFormat.java
License: Open Source License

public static List<org.apache.hadoop.mapreduce.InputSplit> undeprecateSplits(InputSplit[] splits)
        throws IOException {
    final List<org.apache.hadoop.mapreduce.InputSplit> undeprecated =
            new ArrayList<org.apache.hadoop.mapreduce.InputSplit>(splits.length);
    for (final InputSplit s : splits) {
        final FileSplit f = (FileSplit) s;
        undeprecated.add(new org.apache.hadoop.mapreduce.lib.input.FileSplit(
                f.getPath(), f.getStart(), f.getLength(), f.getLocations()));
    }
    return undeprecated;
}
From source file: StreamWikiDumpInputFormat.java
License: Apache License

public RecordReader<Text, Text> getRecordReader(final InputSplit genericSplit, JobConf job,
        Reporter reporter) throws IOException {
    // handling non-standard record reader (likely StreamXmlRecordReader)
    FileSplit split = (FileSplit) genericSplit;
    LOG.info("getRecordReader start.....split=" + split);
    reporter.setStatus(split.toString());

    // Open the file and seek to the start of the split
    FileSystem fs = split.getPath().getFileSystem(job);
    String patt = job.get(KEY_EXCLUDE_PAGE_PATTERN);
    boolean prev = job.getBoolean(KEY_PREVIOUS_REVISION, true);
    return new MyRecordReader(split, reporter, job, fs,
            patt != null && !"".equals(patt) ? Pattern.compile(patt) : null, prev);
}
From source file: DeprecatedBAMRecordReader.java
License: Open Source License

public DeprecatedBAMRecordReader(InputSplit split, final JobConf job, Reporter reporter)
        throws IOException {
    if (split instanceof DeprecatedFileVirtualSplit) {
        rr.initialize(((DeprecatedFileVirtualSplit) split).vs, new FakeTaskAttemptContext(job));
        splitLength = split.getLength();
        return;
    }
    if (split instanceof FileSplit) {
        // Hive gives us its own custom FileSplits for some reason, so we have
        // to do our own split alignment. (Sometimes, anyway; for "select
        // count(*) from table" we get FileSplits here, but for "select * from
        // table" our input format is used directly. Perhaps it's only because
        // the latter doesn't spawn a MapReduce job, so getting a FileSplit
        // here is the common case.)
        //
        // Since we get only one split at a time here, this is very poor: we
        // have to open the file for every split, even if it's the same file
        // every time.
        //
        // This should always work, but might be /very/ slow. I can't think of
        // a better way.
        final FileSplit fspl = (FileSplit) split;
        final Path path = fspl.getPath();
        final long beg = fspl.getStart();
        final long end = beg + fspl.getLength();

        final SeekableStream sin = WrapSeekable.openPath(path.getFileSystem(job), path);
        final BAMSplitGuesser guesser = new BAMSplitGuesser(sin);
        final long alignedBeg = guesser.guessNextBAMRecordStart(beg, end);
        sin.close();

        if (alignedBeg == end)
            throw new IOException("Guesser found nothing after pos " + beg);

        final long alignedEnd = end << 16 | 0xffff;
        splitLength = (alignedEnd - alignedBeg) >> 16;

        rr.initialize(new FileVirtualSplit(path, alignedBeg, alignedEnd, fspl.getLocations()),
                new FakeTaskAttemptContext(job));
        return;
    }
    throw new ClassCastException("Can only handle DeprecatedFileVirtualSplit and FileSplit");
}
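The bit arithmetic at the end of this constructor follows the BGZF virtual-offset convention used for BAM files (an inference from the shifts in the snippet, not something this listing states): the upper 48 bits of a 64-bit offset address a compressed block in the file, and the lower 16 bits address a position within that block once decompressed. A minimal sketch of the packing, with hypothetical names:

public class VirtualOffsets {

    /** Packs a compressed-block file offset and an intra-block offset into one 64-bit value. */
    static long pack(long blockOffset, int withinBlock) {
        return blockOffset << 16 | (withinBlock & 0xffffL);
    }

    /** Upper 48 bits: offset of the compressed block in the file. */
    static long blockOffset(long virtualOffset) {
        return virtualOffset >>> 16;
    }

    /** Lower 16 bits: offset within the uncompressed block. */
    static int withinBlock(long virtualOffset) {
        return (int) (virtualOffset & 0xffff);
    }

    public static void main(String[] args) {
        long end = 1_048_576L;                       // compressed end offset of a split
        long alignedEnd = pack(end, 0xffff);         // same shape as "end << 16 | 0xffff" above
        System.out.println(blockOffset(alignedEnd)); // 1048576
        System.out.println(withinBlock(alignedEnd)); // 65535
    }
}

Under that convention, the splitLength computation above is just the difference of the block-offset components, i.e. an approximate compressed length in bytes.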
From source file: SeekableInputStream.java
License: Apache License

public static SeekableInputStream getInstance(FileSplit split, FileSystem fs,
        CompressionCodecFactory compressionCodecs) throws IOException {
    return getInstance(split.getPath(), split.getStart(), split.getStart() + split.getLength(),
            fs, compressionCodecs);
}
From source file: alluxio.hadoop.HadoopUtils.java
License: Apache License

/**
 * Returns a string representation of a Hadoop {@link FileSplit}.
 *
 * @param fs Hadoop {@link FileSplit}
 * @return its string representation
 */
public static String toStringHadoopFileSplit(FileSplit fs) {
    StringBuilder sb = new StringBuilder();
    sb.append("HadoopFileSplit: Path: ").append(fs.getPath());
    sb.append(" , Start: ").append(fs.getStart());
    sb.append(" , Length: ").append(fs.getLength());
    sb.append(" , Hosts: ");
    String[] locs;
    try {
        locs = fs.getLocations();
    } catch (IOException e) {
        LOG.error(e.getMessage());
        locs = new String[] {};
    }
    for (String loc : locs) {
        sb.append(loc).append("; ");
    }
    return sb.toString();
}
From source file: br.ufrj.nce.recureco.distributedindex.indexer.IndexerMap.java
License: Open Source License

public void map(LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    // The name of the split's file becomes the posting emitted for each indexed term.
    FileSplit fileSplit = (FileSplit) reporter.getInputSplit();
    String filename = fileSplit.getPath().getName();

    List<String> tokenizedLine = lineTokenizer.tokenize(value.toString());
    for (String auxWord : tokenizedLine) {
        output.collect(new Text(auxWord), new Text(filename));
    }
}
From source file: ca.sparkera.adapters.mapred.MainframeVBRecordReader.java
License: Apache License

public MainframeVBRecordReader(Configuration job, FileSplit split) throws IOException {
    reader = new ca.sparkera.adapters.mapreduce.MainframeVBRecordReader();
    reader.initialize(job, split.getStart(), split.getLength(), split.getPath());
}
From source file: com.alexholmes.hadooputils.sort.DelimitedLineRecordReader.java
License: Apache License

protected void initialize(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength", Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (codec != null) {
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job,
                    (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
        }
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        if (rowDelim != null) {
            byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
            in = new DelimitedLineReader(fileIn, job,
                    (hexcode != null) ? hexcode : rowDelim.getBytes());
        } else {
            in = new DelimitedLineReader(fileIn, job);
        }
    }
    if (skipFirstLine) {
        // skip first line and re-establish "start"
        start += in.readLine(new Text(), 0, (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
From source file: com.alexholmes.hadooputils.sort.LzoDelimitedLineRecordReader.java
License: Apache License

@Override
protected void initialize(Configuration job, FileSplit split) throws IOException {
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();

    FileSystem fs = file.getFileSystem(job);
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);
    if (codec == null) {
        throw new IOException("No codec found for file " + file + ", cannot run");
    }

    // open the file and seek to the start of the split
    fileIn = fs.open(split.getPath());

    // creates input stream and also reads the file header
    String rowDelim = job.get("textinputformat.record.delimiter", null);
    if (rowDelim != null) {
        byte[] hexcode = SortConfig.getHexDelimiter(rowDelim);
        in = new DelimitedLineReader(fileIn, job, (hexcode != null) ? hexcode : rowDelim.getBytes());
    } else {
        in = new DelimitedLineReader(codec.createInputStream(fileIn), job);
    }

    if (start != 0) {
        fileIn.seek(start);

        // read and ignore the first line
        in.readLine(new Text());
        start = fileIn.getPos();
    }
    this.pos = start;
}
From source file: com.aliyun.fs.oss.common.OssRecordReader.java
License: Apache License

public OssRecordReader(Configuration job, FileSplit split, FileSystem fs, byte[] recordDelimiter)
        throws IOException {
    this.maxLineLength = job.getInt(
            org.apache.hadoop.mapreduce.lib.input.LineRecordReader.MAX_LINE_LENGTH, Integer.MAX_VALUE);
    start = split.getStart();
    end = start + split.getLength();
    final Path file = split.getPath();
    compressionCodecs = new CompressionCodecFactory(job);
    codec = compressionCodecs.getCodec(file);

    // open the file and seek to the start of the split
    fileIn = fs.open(file);
    if (isCompressedInput()) {
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream(
                    fileIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job, recordDelimiter);
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn; // take pos from compressed stream
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job, recordDelimiter);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job, recordDelimiter);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away the first record
    // because we always (except in the last split) read one extra line in
    // the next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
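The first-record skip at the end of the last two examples implements the standard Hadoop line-splitting protocol: every split except the first discards its first, possibly partial, line, and every split reads one line past its nominal end, so each line is processed by exactly one reader. A self-contained sketch of that invariant, simplified to an in-memory byte array with hypothetical names:

import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

public class SplitLineProtocol {

    /** Returns the index just past the '\n' that ends the line starting at pos. */
    static int skipLine(byte[] data, int pos) {
        while (pos < data.length && data[pos++] != '\n') { }
        return pos;
    }

    /** Lines owned by split [start, end): skip one line unless first split, read while pos <= end. */
    static List<String> readSplit(byte[] data, int start, int end) {
        List<String> lines = new ArrayList<>();
        int pos = (start == 0) ? 0 : skipLine(data, start);   // discard possibly-partial first line
        while (pos <= end && pos < data.length) {             // may read one line past the split end
            int next = skipLine(data, pos);
            lines.add(new String(data, pos, next - pos, StandardCharsets.UTF_8).trim());
            pos = next;
        }
        return lines;
    }

    public static void main(String[] args) {
        byte[] data = "alpha\nbravo\ncharlie\n".getBytes(StandardCharsets.UTF_8);
        // A boundary in the middle of "bravo": no line is lost or duplicated.
        System.out.println(readSplit(data, 0, 8));   // [alpha, bravo]
        System.out.println(readSplit(data, 8, 20));  // [charlie]
    }
}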