List of usage examples for org.apache.hadoop.fs Path toUri
public URI toUri()
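Path.toUri() converts a Hadoop Path into a java.net.URI, exposing its scheme, authority, and path components; the examples below typically pass that URI to FileSystem.get(uri, conf) or print it in log messages. A minimal sketch of the call, using a hypothetical path string that is not taken from any of the examples below:

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class PathToUriExample {
        public static void main(String[] args) throws Exception {
            // Hypothetical fully qualified path; relative or unqualified Paths work too.
            Path p = new Path("hdfs://namenode:8020/user/alice/data/part-00000");
            URI uri = p.toUri();                     // java.net.URI view of the Path
            System.out.println(uri.getScheme());     // hdfs
            System.out.println(uri.getAuthority());  // namenode:8020
            System.out.println(uri.getPath());       // /user/alice/data/part-00000

            // Common pattern in the examples below: resolve the FileSystem that owns this Path.
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(uri, conf);
            System.out.println(fs.getUri());
        }
    }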
From source file:com.twitter.algebra.TransposeJob.java
License:Apache License
/**
 * Perform transpose of A, where A is already wrapped in a DistributedRowMatrix
 * object.
 *
 * @param distM
 *          input matrix A
 * @param conf
 *          the initial configuration
 * @param label
 *          the label for the output directory
 * @return At wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix transpose(DistributedRowMatrix distM, Configuration conf, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    Path outputPath = new Path(distM.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
    TransposeJob job = new TransposeJob();
    if (!fs.exists(outputPath)) {
        job.run(conf, distM.getRowPath(), outputPath, distM.numRows(), distM.numCols());
    } else {
        log.warn("----------- Skip already exists: " + outputPath);
    }
    DistributedRowMatrix m = new DistributedRowMatrix(outputPath, distM.getOutputTempPath(), distM.numCols(),
            distM.numRows());
    m.setConf(conf);
    return m;
}
From source file:com.twitter.algebra.TransposeJob.java
License:Apache License
/**
 * Perform transpose of A, where A refers to the path that contains a matrix
 * in {@link SequenceFileInputFormat}.
 *
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          the path to the input files that we process
 * @param matrixOutputPath
 *          the path of the resulting transpose matrix
 * @param numInputRows
 *          rows
 * @param numInputCols
 *          cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(RowPartitioner.TOTAL_KEYS, numInputCols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "transpose");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(TransposeJob.class);
    job.setJobName(TransposeJob.class.getSimpleName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(TransposeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "transpose");
    job.setNumReduceTasks(numReducers);
    // job.setPartitionerClass(RowPartitioner.IntRowPartitioner.class);
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputCols);

    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file
 *          an input file to work on provided to the job
 * @return true if there is an index file for the input file
 * @throws IOException
 */
public static boolean foundIndexFile(JobContext context, Path file) throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = file.getFileSystem(conf);
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/"
            + BlockIndexedFileInputFormat.INDEXMETAFILENAME);
    if (!fs.exists(indexFilePath)) {
        LOG.info("no index file found for input file:" + file + " at location " + indexFilePath);
        return false;
    }
    FSDataInputStream in = fs.open(indexFilePath);
    ThriftWritable<FileIndexDescriptor> writable = ThriftWritable.newInstance(FileIndexDescriptor.class);
    writable.readFields(in);
    FileIndexDescriptor indexDescriptor = writable.get();
    in.close();
    return verifyInputFileCheckSum(indexDescriptor, context);
}
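The BlockIndexedFileInputFormat examples above and below build index locations by appending file.toUri().getRawPath() to an index root directory. A minimal sketch of what that call yields; the input path, index root, and meta-file name here are hypothetical stand-ins, not the project's actual values:

    import java.net.URI;
    import org.apache.hadoop.fs.Path;

    public class RawPathSketch {
        public static void main(String[] args) {
            // Hypothetical input file; the index root stands in for getIndexDir(context).
            Path file = new Path("hdfs://namenode:8020/logs/2013/01/events.lzo");
            URI uri = file.toUri();
            String indexDir = "/user/elephanttwin/index";
            // getRawPath() keeps only the slash-separated path portion of the URI,
            // so the input file's directory layout is mirrored under the index root.
            System.out.println(uri.getRawPath()); // /logs/2013/01/events.lzo
            // "index.meta" stands in for BlockIndexedFileInputFormat.INDEXMETAFILENAME.
            System.out.println(new Path(indexDir + uri.getRawPath() + "/" + "index.meta"));
        }
    }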
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
        BinaryExpression filterCondition, long splitMaxSize) throws IOException {
    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();
    if (filterCondition.getOpType() == OpType.OP_EQ) { // "leaf node"
        // handle cases like 'abcd' == column , column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }
        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);
        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue, searchedValue,
                part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
                getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
                context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }
    List<LongPairWritable> blocksLeft = getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs,
            splitMaxSize);
    List<LongPairWritable> blocksRight = getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs,
            splitMaxSize);
    if (filterCondition.getOpType() == OpType.OP_AND)
        return andFilter(blocksLeft, blocksRight);
    else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else
        throw new IOException("not supported filter condition:" + filterCondition);
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file
 *          the input file provided to the job to work on
 * @param columnName
 * @return the list of index files if there is an index directory created for
 *         the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName)
        throws IOException {
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}
From source file:com.twitter.elephanttwin.util.HdfsUtils.java
License:Apache License
/**
 * Read UTF-8 lines of text directly from HDFS.
 */
public static Collection<String> readLines(FileSystem fs, Path path) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    LOG.info("Reading from " + path.toUri());
    List<String> lines = Lists.newArrayList();
    try {
        if (!fs.exists(path)) {
            throw new IOException("File not found at " + path);
        }
        // TODO(Jimmy Lin): return CharStreams.readLines(new InputStreamReader(fs.open(path), "UTF-8"))
        // Note that this basically dups the functionality of HdfsFileTransfer.
        BufferedReader stream = new BufferedReader(new InputStreamReader(fs.open(path), "UTF-8"));
        String line;
        while ((line = stream.readLine()) != null) {
            lines.add(line);
        }
        LOG.info("Read " + lines.size() + " queries from " + path.toUri());
        return lines;
    } catch (IOException e) {
        LOG.warning("Failed to read " + path.toUri() + ": " + e.toString());
        throw e;
    }
}
From source file:com.twitter.hraven.etl.FileLister.java
License:Apache License
/**
 * Looks at the src path and fetches the list of files to process.
 * Confirms that the size of each file is less than maxFileSize;
 * an hbase cell can't store files bigger than maxFileSize,
 * hence no need to consider them for raw loading.
 * Reference: {@link https://github.com/twitter/hraven/issues/59}
 * @param maxFileSize - max #bytes to be stored in an hbase cell
 * @param recurse - whether to recurse or not
 * @param hdfs - filesystem to be looked at
 * @param inputPath - root dir of the path containing history files
 * @param pathFilter - the JobFileModifiedRangePathFilter used to filter out files
 * @return - array of FileStatus of files to be processed
 * @throws IOException
 */
public static FileStatus[] getListFilesToProcess(long maxFileSize, boolean recurse, FileSystem hdfs,
        Path inputPath, JobFileModifiedRangePathFilter pathFilter) throws IOException {
    LOG.info(" in getListFilesToProcess maxFileSize=" + maxFileSize + " inputPath= " + inputPath.toUri());
    FileStatus[] origList = listFiles(recurse, hdfs, inputPath, pathFilter);
    if (origList == null) {
        LOG.info(" No files found, orig list returning 0");
        return new FileStatus[0];
    }
    return pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
}
From source file:com.twitter.hraven.etl.FileLister.java
License:Apache License
/**
 * prunes the given list/array of files based on their sizes
 *
 * @param maxFileSize - max #bytes to be stored in an hbase cell
 * @param origList - input list of files to be processed
 * @param hdfs - filesystem to be looked at
 * @param inputPath - root dir of the path containing history files
 * @return - pruned array of FileStatus of files to be processed
 */
static FileStatus[] pruneFileListBySize(long maxFileSize, FileStatus[] origList, FileSystem hdfs,
        Path inputPath) {
    LOG.info("Pruning orig list of size " + origList.length + " for source" + inputPath.toUri());
    long fileSize = 0L;
    List<FileStatus> prunedFileList = new ArrayList<FileStatus>();
    Set<String> toBeRemovedJobId = new HashSet<String>();
    for (int i = 0; i < origList.length; i++) {
        fileSize = origList[i].getLen();
        // check if hbase can store this file; if yes, consider it for processing
        if (fileSize <= maxFileSize) {
            prunedFileList.add(origList[i]);
        } else {
            Path hugeFile = origList[i].getPath();
            LOG.info("In getListFilesToProcess filesize " + fileSize + " has exceeded maxFileSize "
                    + maxFileSize + " for " + hugeFile.toUri());
            // note the job id so that we can remove the other file (job conf or job history)
            toBeRemovedJobId.add(getJobIdFromPath(hugeFile));
        }
    }
    if (prunedFileList.size() == 0) {
        LOG.info("Found no files worth processing. Returning 0 sized array");
        return new FileStatus[0];
    }
    String jobId = null;
    ListIterator<FileStatus> it = prunedFileList.listIterator();
    while (it.hasNext()) {
        if (toBeRemovedJobId.size() == 0) {
            // no files to remove
            break;
        }
        Path curFile = it.next().getPath();
        jobId = getJobIdFromPath(curFile);
        if (toBeRemovedJobId.contains(jobId)) {
            LOG.info("Removing from prunedList " + curFile.toUri());
            it.remove();
            /*
             * removing the job id from the hash set since there would be only
             * one file with this job id in the prunedList; the other file with
             * this job id was huge and was already moved out
             */
            toBeRemovedJobId.remove(jobId);
        }
    }
    return prunedFileList.toArray(new FileStatus[prunedFileList.size()]);
}
From source file:com.twitter.hraven.etl.FileLister.java
License:Apache License
/**
 * extracts the job id from a Path
 * @param aPath input Path
 * @return job id as string
 */
static String getJobIdFromPath(Path aPath) {
    String fileName = aPath.getName();
    JobFile jf = new JobFile(fileName);
    String jobId = jf.getJobid();
    if (jobId == null) {
        throw new ProcessingException("job id is null for " + aPath.toUri());
    }
    return jobId;
}
From source file:com.twitter.hraven.etl.JobFilePartitioner.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    myConf = getConf();

    // Presume these are all HDFS paths, even when accessed as file://
    hdfs = FileSystem.get(myConf);

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(myConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Grab the input path argument
    input = commandLine.getOptionValue("i");
    LOG.info("input=" + input);

    // Grab the output path argument
    String output = commandLine.getOptionValue("o");
    LOG.info("output=" + output);

    skipExisting = commandLine.hasOption("s");
    LOG.info("skipExisting=" + skipExisting);

    moveFiles = commandLine.hasOption("m");
    LOG.info("moveFiles=" + moveFiles);

    if (skipExisting && moveFiles) {
        throw new IllegalArgumentException("Cannot use both options skipExisting and move simultaneously.");
    }

    if (commandLine.hasOption("x")) {
        try {
            maXretention = Integer.parseInt(commandLine.getOptionValue("x"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "maXretention option -x is not a valid number: " + commandLine.getOptionValue("x"), nfe);
        }
        // Additional check
        if (maXretention < 0) {
            throw new IllegalArgumentException(
                    "Cannot retain less than 0 files. Specified maXretention option -x is: "
                            + commandLine.getOptionValue("x"));
        }
        LOG.info("maXretention=" + maXretention);

        if (moveFiles) {
            throw new IllegalArgumentException("Cannot use both options maXretention and move simultaneously.");
        }
    } else {
        maXretention = Integer.MAX_VALUE;
    }

    outputPath = new Path(output);
    FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);
    if (!outputFileStatus.isDir()) {
        throw new IOException("Output is not a directory" + outputFileStatus.getPath().getName());
    }

    Path inputPath = new Path(input);
    URI inputURI = inputPath.toUri();
    String inputScheme = inputURI.getScheme();
    LOG.info("input scheme is: " + inputScheme);

    // If the input directory is HDFS, then process as such. Assume no scheme means HDFS.
    if ((inputScheme == null) || (hdfs.getUri().getScheme().equals(inputScheme))) {
        processHDFSSources(inputPath);
    } else if (inputScheme.equals("file")) {
        if (moveFiles) {
            throw new IllegalArgumentException(
                    "Cannot move files that are not already in hdfs. Input is not HDFS: " + input);
        }
        processPlainFileSources(inputURI);
    } else {
        throw new IllegalArgumentException("Cannot process files from this URI scheme: " + inputScheme);
    }

    Statistics statistics = FileSystem.getStatistics(outputPath.toUri().getScheme(), hdfs.getClass());
    if (statistics != null) {
        LOG.info("HDFS bytes read: " + statistics.getBytesRead());
        LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
        LOG.info("HDFS read ops: " + statistics.getReadOps());
        System.out.println("HDFS large read ops: " + statistics.getLargeReadOps());
        LOG.info("HDFS write ops: " + statistics.getWriteOps());
    }
    return 0;
}
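The run() method above dispatches on the scheme of inputPath.toUri() to decide between HDFS and local-file processing. A minimal sketch of that scheme check in isolation; the paths and the hard-coded "hdfs" comparison are illustrative only (the original compares against the default filesystem's scheme):

    import org.apache.hadoop.fs.Path;

    public class SchemeCheckSketch {
        public static void main(String[] args) {
            // Hypothetical inputs; a Path without a scheme resolves against the default filesystem.
            for (String s : new String[] { "hdfs://namenode:8020/history", "file:///tmp/history", "relative/dir" }) {
                Path p = new Path(s);
                String scheme = p.toUri().getScheme(); // "hdfs", "file", or null
                if (scheme == null || "hdfs".equals(scheme)) {
                    System.out.println(p + " -> treat as HDFS source");
                } else if ("file".equals(scheme)) {
                    System.out.println(p + " -> treat as local filesystem source");
                } else {
                    System.out.println(p + " -> unsupported scheme: " + scheme);
                }
            }
        }
    }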