List of usage examples for org.apache.hadoop.fs.FileSystem#getFileBlockLocations

public BlockLocation[] getFileBlockLocations(Path p, long start, long len) throws IOException

FileSystem also provides an overload that takes a FileStatus, public BlockLocation[] getFileBlockLocations(FileStatus file, long start, long len), which reuses already-fetched metadata instead of resolving the path again; most of the examples below use that variant.
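Before the project-specific examples, here is a minimal, self-contained sketch of the call itself. The class name BlockLocationsDemo is ours (not from any project below), and it assumes a Hadoop client Configuration that can reach the target filesystem:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.BlockLocation;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class BlockLocationsDemo {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path(args[0]);
        FileSystem fs = path.getFileSystem(conf);

        // Resolve the file once; the FileStatus overload reuses this metadata.
        FileStatus status = fs.getFileStatus(path);

        // Ask for the locations of every block in the file: range [0, len).
        BlockLocation[] blocks = fs.getFileBlockLocations(status, 0, status.getLen());

        // Each BlockLocation describes one block: its byte range within the
        // file and the hosts holding a replica of it.
        for (BlockLocation block : blocks) {
            System.out.printf("offset=%d length=%d hosts=%s%n",
                    block.getOffset(), block.getLength(),
                    String.join(",", block.getHosts()));
        }
    }
}

Passing 0 and status.getLen() requests the locations of every block in the file, which is the pattern nearly all of the examples below follow.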
From source file: eu.scape_project.pt.mapred.input.ControlFileInputFormat.java
License: Apache License

/**
 * Gets block locations of input files sorted by the total number of occurrences.
 *
 * @param fs Hadoop filesystem handle
 * @param inFiles array of input files
 * @return sorted String array
 */
public static String[] getSortedHosts(FileSystem fs, Path[] inFiles) throws IOException {
    final Map<String, Integer> hostMap = new HashMap<String, Integer>();
    for (Path inFile : inFiles) {
        FileStatus s = fs.getFileStatus(inFile);
        BlockLocation[] locations = fs.getFileBlockLocations(s, 0, s.getLen());
        for (BlockLocation location : locations) {
            String[] hosts = location.getHosts();
            for (String host : hosts) {
                if (!hostMap.containsKey(host)) {
                    hostMap.put(host, 1);
                    continue;
                }
                hostMap.put(host, hostMap.get(host) + 1);
            }
        }
    }

    // sort hosts by number of references to blocks of input files
    List<String> hosts = new ArrayList<String>();
    hosts.addAll(hostMap.keySet());
    Collections.sort(hosts, new Comparator<String>() {
        @Override
        public int compare(String host1, String host2) {
            return hostMap.get(host2) - hostMap.get(host1);
        }
    });
    return hosts.toArray(new String[0]);
}
From source file: gr.ntua.h2rdf.inputFormat.MyFileInputFormat.java
License: Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new MyFileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new MyFileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    // Cap mapred.reduce.tasks at a count parsed from the fair-scheduler
    // pool name (the text after the first 'l', e.g. "pool8" yields 8).
    String p = job.getConfiguration().get("mapred.fairscheduler.pool");
    int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    if (splits.size() <= max)
        job.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        job.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
From source file: hsyndicate.tools.BlockLocations.java
License: Apache License

public static void main(String[] args) throws Exception {
    Path p = new Path(args[0]);
    Configuration conf = new Configuration();
    FileSystem fs = p.getFileSystem(conf);
    FileStatus f = fs.getFileStatus(p);
    BlockLocation[] bla = fs.getFileBlockLocations(f, 0, f.getLen());

    System.out.println("File : " + f.getPath().toString());
    for (BlockLocation bl : bla) {
        System.out.println("Offset : " + bl.getOffset());
        System.out.println("Len : " + bl.getLength());
        System.out.println("Hosts : " + makeCommaSeparated(bl.getHosts()));
        System.out.println("Names : " + makeCommaSeparated(bl.getNames()));
        System.out.println("TopologyPaths : " + makeCommaSeparated(bl.getTopologyPaths()));
    }
}
From source file: IndexService.IColumnInputFormat.java
License: Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    FileSystem fs = FileSystem.get(job);
    List<IColumnInputSplit> splits = new ArrayList<IColumnInputSplit>();
    HashMap<String, FileStatus> files = new HashMap<String, FileStatus>();
    String[] inputfiles = job.getStrings("mapred.input.dir");

    // For each input file, keep only the largest matching "_idx*" index file.
    for (String file : inputfiles) {
        FileStatus[] fss = fs.globStatus(new Path(file + "_idx*"));
        FileStatus status = null;
        long length = 0;
        for (FileStatus ss : fss) {
            if (ss.getLen() > length) {
                length = ss.getLen();
                status = ss;
            }
        }
        files.put(file, status);
    }

    for (String filekey : files.keySet()) {
        FileStatus file = files.get(filekey);
        Path path = file.getPath();
        Path keypath = new Path(filekey);
        long length = file.getLen();
        tmpPath = keypath;

        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length <= 1) {
            IColumnInputSplit split = new IColumnInputSplit(keypath, length, blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                IColumnInputSplit split = new IColumnInputSplit(keypath, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
    }

    if (splits.size() == 0) {
        splits.add(new IColumnInputSplit(tmpPath, 0, 0, 0, new String[0]));
    }
    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new IColumnInputSplit[splits.size()]);
}
From source file: IndexService.IFormatInputFormat.java
License: Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<IFormatInputSplit> splits = new ArrayList<IFormatInputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length <= 1) {
            IFormatInputSplit split = new IFormatInputSplit(path, length, blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                IFormatInputSplit split = new IFormatInputSplit(path, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
        tmpPath = path;
    }

    if (splits.size() == 0) {
        splits.add(new IFormatInputSplit(tmpPath, 0, 0, 0, new String[0]));
        return splits.toArray(new IFormatInputSplit[splits.size()]);
    }
    LOG.info("Total # of splits: " + splits.size());
    return splits.toArray(new IFormatInputSplit[splits.size()]);
}
From source file: IndexService.IndexMergeIFormatInputFormat.java
License: Open Source License

public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Path tmpPath = null;
    List<IndexMergeIFormatSplit> splits = new ArrayList<IndexMergeIFormatSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job);
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if (blkLocations.length <= 1) {
            IndexMergeIFormatSplit split = new IndexMergeIFormatSplit(path, length, blkLocations[0].getHosts());
            splits.add(split);
        } else {
            String filename = path.toString();
            IFormatDataFile ifd = new IFormatDataFile(job);
            ifd.open(filename);
            ISegmentIndex segmentIndex = ifd.segIndex();
            for (int i = 0; i < segmentIndex.getSegnum(); i++) {
                IndexMergeIFormatSplit split = new IndexMergeIFormatSplit(path, segmentIndex.getseglen(i),
                        segmentIndex.getILineIndex(i).beginline(),
                        segmentIndex.getILineIndex(i).endline() - segmentIndex.getILineIndex(i).beginline() + 1,
                        blkLocations[i].getHosts());
                splits.add(split);
            }
            ifd.close();
        }
        tmpPath = path;
    }

    if (splits.size() == 0) {
        splits.add(new IndexMergeIFormatSplit(tmpPath, 0, null, 0, null));
        return splits.toArray(new IndexMergeIFormatSplit[splits.size()]);
    }
    System.out.println("Total # of splits: " + splits.size());
    return splits.toArray(new IndexMergeIFormatSplit[splits.size()]);
}
From source file: input_format.MyFileInputFormat.java
License: Open Source License

/**
 * Generate the list of files and make them into FileSplits.
 */
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // generate splits
    List<InputSplit> splits = new ArrayList<InputSplit>();
    for (FileStatus file : listStatus(job)) {
        Path path = file.getPath();
        FileSystem fs = path.getFileSystem(job.getConfiguration());
        long length = file.getLen();
        BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
        if ((length != 0) && isSplitable(job, path)) {
            long blockSize = file.getBlockSize();
            long splitSize = computeSplitSize(blockSize, minSize, maxSize);

            long bytesRemaining = length;
            while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining);
                splits.add(new MyFileSplit(path, length - bytesRemaining, splitSize,
                        blkLocations[blkIndex].getHosts()));
                bytesRemaining -= splitSize;
            }

            if (bytesRemaining != 0) {
                splits.add(new MyFileSplit(path, length - bytesRemaining, bytesRemaining,
                        blkLocations[blkLocations.length - 1].getHosts()));
            }
        } else if (length != 0) {
            splits.add(new MyFileSplit(path, 0, length, blkLocations[0].getHosts()));
        } else {
            // Create empty hosts array for zero length files
            splits.add(new MyFileSplit(path, 0, length, new String[0]));
        }
    }
    LOG.debug("Total # of splits: " + splits.size());

    //String p = job.getConfiguration().get("mapred.fairscheduler.pool");
    //int max = Integer.parseInt(p.substring(p.indexOf("l") + 1));
    int max = 1000;
    if (splits.size() <= max)
        job.getConfiguration().setInt("mapred.reduce.tasks", splits.size());
    else
        job.getConfiguration().setInt("mapred.reduce.tasks", max);
    return splits;
}
From source file: io.hops.erasure_coding.BaseEncodingManager.java
License: Apache License

/**
 * RAID an individual file
 */
public static boolean doFileRaid(Configuration conf, Path sourceFile, Path parityPath, Codec codec,
        Statistics statistics, Progressable reporter, int targetRepl, int metaRepl) throws IOException {
    FileSystem srcFs = sourceFile.getFileSystem(conf);
    FileStatus sourceStatus = srcFs.getFileStatus(sourceFile);

    // extract block locations from the file system
    BlockLocation[] locations = srcFs.getFileBlockLocations(sourceFile, 0, sourceStatus.getLen());

    // if the file has 2 or fewer blocks, there is nothing to do
    if (locations.length <= 2) {
        return false;
    }

    // add up the raw disk space occupied by this file
    long diskSpace = 0;
    for (BlockLocation l : locations) {
        diskSpace += (l.getLength() * sourceStatus.getReplication());
    }
    statistics.numProcessedBlocks += locations.length;
    statistics.processedSize += diskSpace;

    // generate parity file
    generateParityFile(conf, sourceStatus, targetRepl, reporter, srcFs, parityPath, codec, locations.length,
            sourceStatus.getReplication(), metaRepl, sourceStatus.getBlockSize());

    if (!srcFs.setReplication(sourceFile, (short) targetRepl)) {
        LOG.info("Error in reducing replication of " + sourceFile + " to " + targetRepl);
        statistics.remainingSize += diskSpace;
        return false;
    }

    diskSpace = 0;
    for (BlockLocation l : locations) {
        diskSpace += (l.getLength() * targetRepl);
    }
    statistics.remainingSize += diskSpace;

    // the metafile will have this many blocks
    int numMeta = locations.length / codec.stripeLength;
    if (locations.length % codec.stripeLength != 0) {
        numMeta++;
    }

    // we create numMeta blocks for every file, each with metaRepl replicas.
    // The last block of the metafile might not be completely filled up, but
    // we ignore that for now.
    statistics.numMetaBlocks += (numMeta * metaRepl);
    statistics.metaSize += (numMeta * metaRepl * sourceStatus.getBlockSize());
    return true;
}
From source file: it.prz.jmatrw4spark.JMATFileInputFormat.java
License: Open Source License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
    long maxSize = getMaxSplitSize(job);

    // It generates the splits.
    List<InputSplit> splits = new ArrayList<InputSplit>();
    List<FileStatus> files = listStatus(job);
    for (FileStatus file : files) {
        Path filePath = file.getPath();

        // Calculates the content (array of double) length in bytes.
        FileSystem fs = filePath.getFileSystem(job.getConfiguration());
        FSDataInputStream dis = fs.open(filePath);
        JMATReader _matReader = new JMATReader(dis);
        JMATInfo _matdata = _matReader.getInfo();
        long length = _matdata.dataNumOfItems * MLDataType.miDOUBLE.bytes; // Content length.
        long lContentByteOffset = dis.getPos();
        _matReader.close();
        _matReader = null;
        dis = null;

        // Zero bytes, empty file split.
        if (length <= 0) {
            // Create empty hosts array for zero length files
            splits.add(makeSplit(filePath, 0, length, new String[0]));
        }

        // Split the data.
        if (length > 0) {
            BlockLocation[] blkLocations;
            if (file instanceof LocatedFileStatus) {
                blkLocations = ((LocatedFileStatus) file).getBlockLocations();
            } else {
                blkLocations = fs.getFileBlockLocations(file, lContentByteOffset, length);
            }

            boolean isSplittable = isSplitable(job, filePath);
            LOG.debug("Current file to process " + filePath.getName() + ". Splittable? " + isSplittable);
            if (isSplittable) {
                long blockSize = file.getBlockSize();
                long splitSize = computeSplitSize(blockSize, minSize, maxSize);

                long bytesRemaining = length;
                while (((double) bytesRemaining) / splitSize > SPLIT_SLOP) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, splitSize,
                            blkLocations[blkIndex].getHosts()));
                    bytesRemaining -= splitSize;
                }

                if (bytesRemaining != 0) {
                    long lBlockByteStart = lContentByteOffset + length - bytesRemaining;
                    int blkIndex = getBlockIndex(blkLocations, lBlockByteStart);
                    splits.add(makeSplit(filePath, lBlockByteStart, bytesRemaining,
                            blkLocations[blkIndex].getHosts()));
                }
            } else { // not splitable
                splits.add(makeSplit(filePath, lContentByteOffset, length, blkLocations[0].getHosts()));
            }
        }
    }

    // Save the number of input files for metrics/loadgen
    job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
    LOG.debug("Total # of splits: " + splits.size());
    return splits;
}
From source file: kogiri.mapreduce.preprocess.common.kmerindex.KmerIndexSplit.java
License: Open Source License

private String[] findBestLocations(Path[] indexFilePaths, Configuration conf) throws IOException {
    Hashtable<String, MutableInteger> blkLocations = new Hashtable<String, MutableInteger>();
    for (Path path : indexFilePaths) {
        FileSystem fs = path.getFileSystem(conf);
        FileStatus fileStatus = fs.getFileStatus(path);
        BlockLocation[] fileBlockLocations = fs.getFileBlockLocations(fileStatus, 0, fileStatus.getLen());
        for (BlockLocation location : fileBlockLocations) {
            for (String host : location.getHosts()) {
                MutableInteger cnt = blkLocations.get(host);
                if (cnt == null) {
                    blkLocations.put(host, new MutableInteger(1));
                } else {
                    cnt.increase();
                }
            }
        }
    }

    List<String> blkLocationsArr = new ArrayList<String>();
    for (String key : blkLocations.keySet()) {
        blkLocationsArr.add(key);
    }

    if (blkLocationsArr.size() == 0) {
        return new String[] { "localhost" };
    } else {
        return blkLocationsArr.toArray(new String[0]);
    }
}