Usage examples for org.apache.hadoop.mapreduce.InputSplit.getLength()
public abstract long getLength() throws IOException, InterruptedException;
From source file:com.marklogic.contentpump.CompressedRDFReader.java
License:Apache License
@Override protected void initStream(InputSplit inSplit) throws IOException, InterruptedException { setFile(((FileSplit) inSplit).getPath()); FSDataInputStream fileIn = fs.open(file); URI zipURI = file.toUri();//from ww w . ja va 2 s. c om String codecString = conf.get(ConfigConstants.CONF_INPUT_COMPRESSION_CODEC, CompressionCodec.ZIP.toString()); if (codecString.equalsIgnoreCase(CompressionCodec.ZIP.toString())) { zipIn = new ZipInputStream(fileIn); codec = CompressionCodec.ZIP; while (true) { try { currZipEntry = ((ZipInputStream) zipIn).getNextEntry(); if (currZipEntry == null) { break; } if (currZipEntry.getSize() != 0) { subId = currZipEntry.getName(); break; } } catch (IllegalArgumentException e) { LOG.warn("Skipped a zip entry in : " + file.toUri() + ", reason: " + e.getMessage()); } } if (currZipEntry == null) { // no entry in zip LOG.warn("No valid entry in zip:" + file.toUri()); return; } ByteArrayOutputStream baos; long size = currZipEntry.getSize(); if (size == -1) { baos = new ByteArrayOutputStream(); // if we don't know the size, assume it's big! initParser(zipURI.toASCIIString() + "/" + subId, INMEMORYTHRESHOLD); } else { baos = new ByteArrayOutputStream((int) size); initParser(zipURI.toASCIIString() + "/" + subId, size); } int nb; while ((nb = zipIn.read(buf, 0, buf.length)) != -1) { baos.write(buf, 0, nb); } parse(subId, new ByteArrayInputStream(baos.toByteArray())); } else if (codecString.equalsIgnoreCase(CompressionCodec.GZIP.toString())) { long size = inSplit.getLength(); zipIn = new GZIPInputStream(fileIn); codec = CompressionCodec.GZIP; initParser(zipURI.toASCIIString(), size * COMPRESSIONFACTOR); parse(file.getName(), zipIn); } else { throw new UnsupportedOperationException("Unsupported codec: " + codec.name()); } }
From source file:com.marklogic.contentpump.DelimitedJSONReader.java
License:Apache License
@Override public void initialize(InputSplit inSplit, TaskAttemptContext context) throws IOException, InterruptedException { /* Initialization in super class */ initConfig(context);/* w w w . ja v a 2 s . c o m*/ /* Get file(s) in input split */ setFile(((FileSplit) inSplit).getPath()); // Initialize reader properties generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false); if (generateId) { idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart()); } else { uriName = conf.get(CONF_INPUT_URI_ID, null); mapper = new ObjectMapper(); } bytesRead = 0; totalBytes = inSplit.getLength(); /* Check file status */ fs = file.getFileSystem(context.getConfiguration()); FileStatus status = fs.getFileStatus(file); if (status.isDirectory()) { iterator = new FileIterator((FileSplit) inSplit, context); inSplit = iterator.next(); } /* Initialize buffered reader */ initFileStream(inSplit); }
From source file:com.marklogic.contentpump.DelimitedTextReader.java
License:Apache License
/**
 * Opens the split's file and builds a CSV parser over it.
 *
 * <p>If no URI column name is set, either an {@code IdGenerator} seeded with
 * the file path and split start is created (when URI generation is enabled
 * in the configuration) or the URI column index is reset to 0.
 *
 * @param inSplit the split to read; must be a {@code FileSplit}
 * @throws IOException if the file cannot be opened
 * @throws InterruptedException declared by {@code InputSplit.getLength()}
 */
protected void initParser(InputSplit inSplit) throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) inSplit;
    setFile(fileSplit.getPath());
    configFileNameAsCollection(conf, file);

    fileIn = fs.open(file);
    instream = new InputStreamReader(fileIn, encoding);
    bytesRead = 0;
    fileLen = inSplit.getLength();

    if (uriName == null) {
        generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false);
        if (!generateId) {
            uriId = 0;
        } else {
            idGen = new IdGenerator(file.toUri().getPath() + "-" + fileSplit.getStart());
        }
    }

    parser = new CSVParser(instream,
            CSVParserFormatter.getFormat(delimiter, encapsulator, true, true));
    parserIterator = parser.iterator();
}
From source file:com.marklogic.contentpump.RDFReader.java
License:Apache License
/**
 * Records the split's file, initializes the parser with the file URI and the
 * split length, then parses the file by name.
 *
 * @param inSplit the split to read; must be a {@code FileSplit}
 * @throws IOException propagated from parser setup or parsing
 * @throws InterruptedException declared by {@code InputSplit.getLength()}
 */
protected void initStream(InputSplit inSplit) throws IOException, InterruptedException {
    final FileSplit fileSplit = (FileSplit) inSplit;
    setFile(fileSplit.getPath());

    final long splitLength = inSplit.getLength();
    final String baseUri = file.toUri().toASCIIString();
    initParser(baseUri, splitLength);

    parse(file.getName());
}
From source file:com.marklogic.contentpump.SplitDelimitedTextReader.java
License:Apache License
@Override protected void initParser(InputSplit inSplit) throws IOException, InterruptedException { setFile(((DelimitedSplit) inSplit).getPath()); configFileNameAsCollection(conf, file); // get header from the DelimitedSplit TextArrayWritable taw = ((DelimitedSplit) inSplit).getHeader(); fields = taw.toStrings();//ww w . j av a2 s. c om try { docBuilder.configFields(conf, fields); } catch (IllegalArgumentException e) { LOG.error("Skipped file: " + file.toUri() + ", reason: " + e.getMessage()); return; } fileIn = fs.open(file); lineSeparator = retrieveLineSeparator(fileIn); if (start != 0) { // in case the cut point is \n, back off 1 char to create a partial // line so that 1st line can be skipped start--; } fileIn.seek(start); instream = new InputStreamReader(fileIn, encoding); bytesRead = 0; fileLen = inSplit.getLength(); if (uriName == null) { generateId = conf.getBoolean(CONF_INPUT_GENERATE_URI, false); if (generateId) { idGen = new IdGenerator(file.toUri().getPath() + "-" + ((FileSplit) inSplit).getStart()); } else { uriId = 0; } } boolean found = generateId || uriId == 0; for (int i = 0; i < fields.length && !found; i++) { if (fields[i].equals(uriName)) { uriId = i; found = true; break; } } if (found == false) { // idname doesn't match any columns LOG.error("Skipped file: " + file.toUri() + ", reason: " + URI_ID + " " + uriName + " is not found"); return; } // keep leading and trailing whitespaces to ensure accuracy of pos // do not skip empty line just in case the split boundary is \n parser = new CSVParser(instream, CSVParserFormatter.getFormat(delimiter, encapsulator, false, false)); parserIterator = parser.iterator(); // skip first line: // 1st split, skip header; other splits, skip partial line if (parserIterator.hasNext()) { String[] values = getLine(); start += getBytesCountFromLine(values); pos = start; } }
From source file:com.marklogic.mapreduce.examples.ContentLoader.java
License:Apache License
@Override public void initialize(InputSplit inSplit, TaskAttemptContext context) throws IOException, InterruptedException { bytesTotal = inSplit.getLength(); Path file = ((FileSplit) inSplit).getPath(); FileSystem fs = file.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fs.open(file); key.set(file.toString());//ww w.ja va 2 s . c om byte[] buf = new byte[(int) inSplit.getLength()]; try { fileIn.readFully(buf); value.set(buf); hasNext = true; } catch (Exception e) { hasNext = false; } finally { fileIn.close(); } }
From source file:com.marklogic.mapreduce.examples.WikiLoader.java
License:Apache License
@Override public void initialize(InputSplit inSplit, TaskAttemptContext context) throws IOException, InterruptedException { Path file = ((FileSplit) inSplit).getPath(); FileSystem fs = file.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fs.open(file); byte[] buf = new byte[BUFFER_SIZE]; long bytesTotal = inSplit.getLength(); long start = ((FileSplit) inSplit).getStart(); fileIn.seek(start);/*from w ww.j a v a2 s . co m*/ long bytesRead = 0; StringBuilder pages = new StringBuilder(); int sindex = -1; while (true) { int length = (int) Math.min(bytesTotal - bytesRead, buf.length); int read = fileIn.read(buf, 0, length); if (read == -1) { System.out.println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead); break; } bytesRead += read; String temp = new String(new String(buf, 0, read)); if (sindex == -1) { // haven't found the start yet sindex = temp.indexOf(BEGIN_PAGE_TAG); if (sindex > -1) { pages.append(temp.substring(sindex)); } } else if (bytesRead < bytesTotal) { // haven't completed the split pages.append(temp); } else { // reached the end of this split // look for end int eindex = 0; if (temp.contains(END_DOC_TAG) || // reached the end of doc temp.endsWith(END_PAGE_TAG)) { eindex = temp.lastIndexOf(END_PAGE_TAG); pages.append(temp.substring(0, eindex + END_PAGE_TAG.length())); System.out.println("Found end of doc."); } else { // need to read ahead to look for end of page while (true) { read = fileIn.read(buf, 0, READ_AHEAD_SIZE); if (read == -1) { // no more to read System.out .println("Unexpected EOF: bytesTotal=" + bytesTotal + "bytesRead=" + bytesRead); System.out.println(temp); break; } bytesRead += read; // look for end temp = new String(buf, 0, read); eindex = temp.indexOf(END_PAGE_TAG); if (eindex > -1) { pages.append(temp.substring(0, eindex + END_PAGE_TAG.length())); break; } else { pages.append(temp); } } } break; } } fileIn.close(); articles = WikiModelProcessor.process(pages); }
From source file:com.tomslabs.grid.avro.AvroRecordReader.java
License:Apache License
@Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { FileSplit fileSplit = (FileSplit) split; Configuration config = context.getConfiguration(); Path path = fileSplit.getPath(); this.in = new FsInput(path, config); DatumReader<T> datumReader = getDatumReader(config); this.reader = new DataFileReader<T>(in, datumReader); reader.sync(fileSplit.getStart()); // sync to start this.start = in.tell(); this.end = fileSplit.getStart() + split.getLength(); }
From source file:com.twitter.elephanttwin.retrieval.IndexedFilterRecordReader.java
License:Apache License
@Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { this.context = context; if (split.getLength() <= 0) { filterReader = null;//from w w w. j av a2s.c o m return; } fileSplits = computeFileSplits(split, context); subBlockCnt = fileSplits.size(); if (subBlockCnt == 0) { filterReader = null; return; } currentSubBlock = 1; filterReader.initialize(fileSplits.remove(0), context); }
From source file:edu.umn.cs.spatialHadoop.mapreduce.RTreeRecordReader3.java
License:Open Source License
public void initialize(InputSplit split, Configuration conf) throws IOException, InterruptedException { LOG.info("Open a SpatialRecordReader to split: " + split); FileSplit fsplit = (FileSplit) split; this.path = fsplit.getPath(); this.start = fsplit.getStart(); this.end = this.start + split.getLength(); this.fs = this.path.getFileSystem(conf); this.directIn = fs.open(this.path); codec = new CompressionCodecFactory(conf).getCodec(this.path); if (codec != null) { // Input is compressed, create a decompressor to decompress it decompressor = CodecPool.getDecompressor(codec); if (codec instanceof SplittableCompressionCodec) { // A splittable compression codec, can seek to the desired input pos final SplitCompressionInputStream cIn = ((SplittableCompressionCodec) codec).createInputStream( directIn, decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK); in = new DataInputStream(cIn); start = cIn.getAdjustedStart(); end = cIn.getAdjustedEnd();/* w ww .j ava 2 s .c om*/ // take pos from compressed stream as we adjusted both start and end // to match with the compressed file filePosition = cIn; } else { // Non-splittable input, need to start from the beginning CompressionInputStream cIn = codec.createInputStream(directIn, decompressor); in = new DataInputStream(cIn); filePosition = cIn; } } else { // Non-compressed file, seek to the desired position and use this stream // to get the progress and position directIn.seek(start); in = directIn; filePosition = directIn; } byte[] signature = new byte[8]; in.readFully(signature); if (!Arrays.equals(signature, SpatialSite.RTreeFileMarkerB)) { throw new RuntimeException("Incorrect signature for RTree"); } this.stockShape = (V) OperationsParams.getShape(conf, "shape"); if (conf.get(SpatialInputFormat3.InputQueryRange) != null) { // Retrieve the input query range to apply on all records this.inputQueryRange = OperationsParams.getShape(conf, SpatialInputFormat3.InputQueryRange); this.inputQueryMBR = 
this.inputQueryRange.getMBR(); } // Check if there is an associated global index to read cell boundaries GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, path.getParent()); if (gindex == null) { cellMBR = new Partition(); cellMBR.invalidate(); } else { // Set from the associated partition in the global index for (Partition p : gindex) { if (p.filename.equals(this.path.getName())) cellMBR = p; } } }