List of usage examples for org.apache.hadoop.mapreduce.lib.input FileSplit getLocations
@Override public String[] getLocations() throws IOException
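getLocations() returns the hostnames of the nodes holding the split's data; schedulers use it to place map tasks near their input. The examples below all follow the same pattern: when an InputFormat re-slices or combines splits, it copies the original FileSplit's locations into the new split so the locality hints are not lost. A minimal sketch of that pattern (the class, helper name, and start/length values are illustrative, not taken from any of the source files below):

import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

public class SplitReslicer {
    // Illustrative helper: build a new split over [newStart, newStart + newLength)
    // of the same file, reusing the original split's host hints.
    static FileSplit reslice(FileSplit original, long newStart, long newLength) throws IOException {
        Path file = original.getPath();
        String[] hosts = original.getLocations(); // nodes holding the underlying blocks
        return new FileSplit(file, newStart, newLength, hosts);
    }
}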
From source file:bsc.spark.examples.terasort.ehiggs.TeraScheduler.java
License:Apache License
public TeraScheduler(FileSplit[] realSplits, Configuration conf) throws IOException {
    this.realSplits = realSplits;
    this.slotsPerHost = conf.getInt(TTConfig.TT_MAP_SLOTS, 4);
    Map<String, Host> hostTable = new HashMap<String, Host>();
    splits = new Split[realSplits.length];
    for (FileSplit realSplit : realSplits) {
        Split split = new Split(realSplit.getPath().toString());
        splits[remainingSplits++] = split;
        for (String hostname : realSplit.getLocations()) {
            Host host = hostTable.get(hostname);
            if (host == null) {
                host = new Host(hostname);
                hostTable.put(hostname, host);
                hosts.add(host);
            }
            host.splits.add(split);
            split.locations.add(host);
        }
    }
}
From source file:com.hadoop.mapreduce.FourMcInputFormat.java
License:BSD License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);
    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMcBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMcBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMcBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMcBlockIndex.NOT_FOUND && fourMcEnd != FourMcBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mc split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }
    }
    return result;
}
From source file:com.hadoop.mapreduce.FourMzInputFormat.java
License:BSD License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = HadoopUtils.getConfiguration(job);
    List<InputSplit> defaultSplits = super.getSplits(job);
    List<InputSplit> result = new ArrayList<InputSplit>();

    Path prevFile = null;
    FourMzBlockIndex prevIndex = null;

    for (InputSplit genericSplit : defaultSplits) {
        // Load the index.
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        FileSystem fs = file.getFileSystem(conf);

        FourMzBlockIndex index;
        if (file.equals(prevFile)) {
            index = prevIndex;
        } else {
            index = FourMzBlockIndex.readIndex(fs, file);
            prevFile = file;
            prevIndex = index;
        }

        if (index == null) {
            throw new IOException("BlockIndex unreadable for " + file);
        }

        if (index.isEmpty()) { // leave the default split for empty block index
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        long fourMcStart = index.alignSliceStartToIndex(start, end);
        long fourMcEnd = index.alignSliceEndToIndex(end, fs.getFileStatus(file).getLen());

        if (fourMcStart != FourMzBlockIndex.NOT_FOUND && fourMcEnd != FourMzBlockIndex.NOT_FOUND) {
            result.add(new FileSplit(file, fourMcStart, fourMcEnd - fourMcStart, fileSplit.getLocations()));
            LOG.debug("Added 4mz split for " + file + "[start=" + fourMcStart + ", length="
                    + (fourMcEnd - fourMcStart) + "]");
        }
    }
    return result;
}
From source file:com.hadoop.mapreduce.LzoTextInputFormat.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    // find new start/ends of the filesplit that aligns
    // with the lzo blocks
    List<InputSplit> result = new ArrayList<InputSplit>();
    FileSystem fs = FileSystem.get(job.getConfiguration());

    for (InputSplit genericSplit : splits) {
        // load the index
        FileSplit fileSplit = (FileSplit) genericSplit;
        Path file = fileSplit.getPath();
        LzoIndex index = indexes.get(file);
        if (index == null) {
            throw new IOException("Index not found for " + file);
        }

        if (index.isEmpty()) {
            // empty index, keep as is
            result.add(fileSplit);
            continue;
        }

        long start = fileSplit.getStart();
        long end = start + fileSplit.getLength();

        if (start != 0) {
            // find the next block position from
            // the start of the split
            long newStart = index.findNextPosition(start);
            if (newStart == -1 || newStart >= end) {
                // just skip this since it will be handled by another split
                continue;
            }
            start = newStart;
        }

        long newEnd = index.findNextPosition(end);
        if (newEnd != -1) {
            end = newEnd;
        } else {
            // didn't find the next position
            // we have hit the end of the file
            end = fs.getFileStatus(file).getLen();
        }

        result.add(new FileSplit(file, start, end - start, fileSplit.getLocations()));
    }
    return result;
}
From source file:com.marklogic.contentpump.CombineDocumentSplit.java
License:Apache License
public void addSplit(FileSplit split) throws IOException, InterruptedException {
    splits.add(split);
    length += split.getLength();
    for (String loc : split.getLocations()) {
        if (!locations.contains(loc)) {
            locations.add(loc);
        }
    }
}
From source file:com.marklogic.contentpump.DelimitedTextInputFormat.java
License:Apache License
public List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean delimSplit = isSplitInput(job.getConfiguration());
    // if delimSplit is true, size of each split is determined by
    // Math.max(minSize, Math.min(maxSize, blockSize)) in FileInputFormat
    List<InputSplit> splits = super.getSplits(job);
    if (!delimSplit) {
        return splits;
    }

    if (splits.size() >= SPLIT_COUNT_LIMIT) {
        // if #splits > 1 million, there is enough parallelism,
        // therefore no point in splitting further
        LOG.warn("Exceeding SPLIT_COUNT_LIMIT, input_split is off:" + SPLIT_COUNT_LIMIT);
        DefaultStringifier.store(job.getConfiguration(), false, ConfigConstants.CONF_SPLIT_INPUT);
        return splits;
    }

    // add header info into splits
    List<InputSplit> populatedSplits = new ArrayList<InputSplit>();
    LOG.info(splits.size() + " DelimitedSplits generated");
    Configuration conf = job.getConfiguration();
    char delimiter = 0;
    ArrayList<Text> hlist = new ArrayList<Text>();

    for (InputSplit file : splits) {
        FileSplit fsplit = ((FileSplit) file);
        Path path = fsplit.getPath();
        FileSystem fs = path.getFileSystem(conf);

        if (fsplit.getStart() == 0) {
            // parse the split, get the header
            FSDataInputStream fileIn = fs.open(path);

            String delimStr = conf.get(ConfigConstants.CONF_DELIMITER, ConfigConstants.DEFAULT_DELIMITER);
            if (delimStr.length() == 1) {
                delimiter = delimStr.charAt(0);
            } else {
                LOG.error("Incorrect delimiter: " + delimStr + ". Expects single character.");
            }
            String encoding = conf.get(MarkLogicConstants.OUTPUT_CONTENT_ENCODING,
                    MarkLogicConstants.DEFAULT_OUTPUT_CONTENT_ENCODING);
            InputStreamReader instream = new InputStreamReader(fileIn, encoding);

            CSVParser parser = new CSVParser(instream,
                    CSVParserFormatter.getFormat(delimiter, DelimitedTextReader.encapsulator, true, true));
            Iterator<CSVRecord> it = parser.iterator();

            String[] header = null;
            if (it.hasNext()) {
                CSVRecord record = (CSVRecord) it.next();
                Iterator<String> recordIterator = record.iterator();
                int recordSize = record.size();
                header = new String[recordSize];
                for (int i = 0; i < recordSize; i++) {
                    if (recordIterator.hasNext()) {
                        header[i] = (String) recordIterator.next();
                    } else {
                        throw new IOException("Record size doesn't match the real size");
                    }
                }

                EncodingUtil.handleBOMUTF8(header, 0);

                hlist.clear();
                for (String s : header) {
                    hlist.add(new Text(s));
                }
            }
            instream.close();
        }

        DelimitedSplit ds = new DelimitedSplit(new TextArrayWritable(hlist.toArray(new Text[hlist.size()])),
                path, fsplit.getStart(), fsplit.getLength(), fsplit.getLocations());
        populatedSplits.add(ds);
    }

    return populatedSplits;
}
From source file:com.scaleoutsoftware.soss.hserver.FileImage.java
License:Apache License
/**
 * Calculates additional splits in case files were modified since the image was last recorded. It relies
 * on the fact that HDFS only allows appending to files. The
 * {@link InputFormat#getSplits(org.apache.hadoop.mapreduce.JobContext)} method is called to get the split
 * list, and then already recorded splits are removed from that list. In case of a partially recorded split,
 * the split is truncated so that it contains only the appended part.
 *
 * @param context job context
 * @param format  input format
 */
@SuppressWarnings("unchecked")
void addNewSplits(JobContext context, InputFormat format) throws IOException, InterruptedException {
    List<InputSplit> newSplits = format.getSplits(context);

    for (InputSplit inputSplit : newSplits) {
        FileSplit split = (FileSplit) inputSplit;
        String path = split.getPath().toString();

        if (!filesPaths.contains(path))
            throw new IOException("No such file in the recorded image.");

        long currentFileLength = lengths.get(path);
        long begin = split.getStart();

        if (begin >= currentFileLength) { // New split is entirely in the appended part of the file
            splits.add(new ImageInputSplit(inputSplit, getImageIdString(), creationTimestamp, splits.size()));
            continue;
        }

        long end = begin + split.getLength();

        if (end <= currentFileLength)
            continue; // New split is entirely in the recorded area, we don't need it

        FileSplit additionalSplit = new FileSplit(split.getPath(), currentFileLength,
                end - currentFileLength, split.getLocations());
        splits.add(new ImageInputSplit(additionalSplit, getImageIdString(), creationTimestamp, splits.size()));
    }

    List<FileStatus> files = getFiles(context, (FileInputFormat) format);
    for (FileStatus file : files) {
        String filePath = file.getPath().toString();
        modificationDate.put(filePath, file.getModificationTime());
        lengths.put(filePath, file.getLen());
    }
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * Go through each original input split, get its file path, and check the index file:
 * a) keep it when there is no index prebuilt on this file
 *    (or the index file doesn't match the base file's checksum);
 * b) remove it when no matching value is found in the existing index file;
 * c) construct new, smaller input splits using indexed blocks found in the index file.
 */
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    String inputformat = job.getConfiguration().get(REALINPUTFORMAT);
    String valueClass = job.getConfiguration().get(VALUECLASS);

    List<InputSplit> filteredList = new ArrayList<InputSplit>();

    FileInputFormat<K, V> realInputFormat = getInputFormatClass(inputformat, valueClass);

    List<InputSplit> splits = realInputFormat.getSplits(job);

    // if indexing jobs, don't skip any input splits.
    // if searching job but no searching filter, skip the index as well.
    if (isIndexingJob(job) || getFilterCondition(job) == null)
        return splits;

    Path prevFile = null; // remember the last input file we saw
    boolean foundIndexedFile = false; // is there an index file for prevFile?
    boolean firstTime = true; // is this the first time we see this file?

    long totalOriginalBytes = 0; // the bytes to be scanned without indexes.
    totalBytesNewSplits = 0;
    long startTime = System.currentTimeMillis();
    LOG.info("start filtering out original input splits (total " + splits.size() + ") using indexes");
    Configuration conf = job.getConfiguration();
    long splitMaxSize;

    // for each original input split check if we can filter it out.
    for (InputSplit split : splits) {
        FileSplit fileSplit = (FileSplit) split;
        Path path = fileSplit.getPath();
        splitLength = fileSplit.getLength();
        totalOriginalBytes += fileSplit.getLength();
        splitMaxSize = Math.max(splitLength,
                conf.getInt(INDEXED_SPLIT_SIZE, conf.getInt("dfs.block.size", 256 * 1024 * 1024)));

        /*
         * for each new file we see, we first check if it has been indexed or not;
         * if not, we just add the original input split; if yes, we use the index
         * file to add filtered splits for the file
         */
        if (prevFile != null && path.equals(prevFile)) {
            firstTime = false;
        } else {
            prevFile = path;
            firstTime = true;
            foundIndexedFile = foundIndexFile(job, path);
        }

        // if no index file, we'll have to read all original input splits
        if (!foundIndexedFile)
            filteredList.add(fileSplit);
        else {
            // for each file we add its filtered input splits only once, using the index file
            if (firstTime) {
                // LOG.info("first time saw " + path
                //     + ", adding filtered splits from index file");
                filteredList.addAll(getFilteredSplits(job, path, fileSplit.getLocations(), splitMaxSize));
            }
        }
    }

    long endTime = System.currentTimeMillis();
    LOG.info("finished filtering out input splits, now total splits:" + filteredList.size()
            + ", seconds used: " + (endTime - startTime) / 1000);
    LOG.info(String.format("total bytes to read before filtering: %s, after filtering %s, bytes ratio: %s",
            totalOriginalBytes, totalBytesNewSplits, totalOriginalBytes / Math.max(1, totalBytesNewSplits)));
    return filteredList;
}
From source file:com.twitter.elephanttwin.retrieval.OneSplitInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    Configuration conf = job.getConfiguration();
    FileSplit split = (FileSplit) super.getSplits(job).get(0);
    List<InputSplit> lists = new ArrayList<InputSplit>();
    lists.add(new FileSplit(split.getPath(), conf.getLong(START, 0),
            conf.getLong(END, 0) - conf.getLong(START, 0), split.getLocations()));
    return lists;
}
From source file:edu.umn.cs.spatialHadoop.mapreduce.SpatialInputFormat3.java
License:Open Source License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Configuration jobConf = job.getConfiguration();
    if (jobConf.getInt(CombineSplits, 1) > 1) {
        long t1 = System.currentTimeMillis();
        int combine = jobConf.getInt(CombineSplits, 1);
        /*
         * Combine splits to reduce number of map tasks. Currently, this is done
         * using a greedy algorithm that combines splits based on how many hosts
         * they share.
         * TODO: Use a graph clustering algorithm where each vertex represents a
         * split, and each edge is weighted with the number of shared hosts between
         * the two splits.
         */
        Vector<Vector<FileSplit>> openSplits = new Vector<Vector<FileSplit>>();
        int maxNumberOfSplits = (int) Math.ceil((float) splits.size() / combine);
        List<InputSplit> combinedSplits = new Vector<InputSplit>();
        for (InputSplit split : splits) {
            FileSplit fsplit = (FileSplit) split;
            int maxSimilarity = -1; // Best similarity found so far
            int bestFit = -1; // Index of a random open split with max similarity
            int numMatches = 0; // Number of splits with max similarity
            for (int i = 0; i < openSplits.size(); i++) {
                Vector<FileSplit> splitList = openSplits.elementAt(i);
                int similarity = 0;
                for (FileSplit otherSplit : splitList) {
                    for (String host1 : fsplit.getLocations())
                        for (String host2 : otherSplit.getLocations())
                            if (host1.equals(host2))
                                similarity++;
                }
                if (similarity > maxSimilarity) {
                    maxSimilarity = similarity;
                    bestFit = i;
                    numMatches = 1;
                } else if (similarity == maxSimilarity) {
                    numMatches++;
                    // Replace with probability 1/numMatches for a reservoir sample
                    double random = Math.random();
                    if (random < (double) 1 / numMatches) {
                        // Replace the element in the reservoir
                        bestFit = i;
                    }
                }
            }
            if (maxSimilarity > 0 || (openSplits.size() + combinedSplits.size()) >= maxNumberOfSplits) {
                // Good fit || cannot create more open splits,
                // add it to an existing open split.
                Vector<FileSplit> bestList = openSplits.elementAt(bestFit);
                bestList.add(fsplit);
                if (bestList.size() > combine) {
                    // Reached threshold for this list. Add it to combined splits
                    combinedSplits.add(FileSplitUtil.combineFileSplits(bestList, 0, bestList.size()));
                    // Remove it from open splits
                    openSplits.remove(bestFit);
                }
            } else {
                // Bad fit && can add a new split
                // Create a new open split just for this one
                Vector<FileSplit> newOpenSplit = new Vector<FileSplit>();
                newOpenSplit.add(fsplit);
                openSplits.addElement(newOpenSplit);
            }
        }

        // Add all remaining open splits to the list of combined splits
        for (Vector<FileSplit> openSplit : openSplits) {
            combinedSplits.add(FileSplitUtil.combineFileSplits(openSplit, 0, openSplit.size()));
        }

        String msg = String.format("Combined %d splits into %d combined splits", splits.size(),
                combinedSplits.size());
        splits.clear();
        splits.addAll(combinedSplits);
        long t2 = System.currentTimeMillis();
        LOG.info(msg + " in " + ((t2 - t1) / 1000.0) + " seconds");
    }
    return splits;
}