List of usage examples for org.apache.hadoop.fs.Path.getName()
public String getName()
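Path.getName() returns the final component of the path, i.e. everything after the last '/'. Before the full examples below, here is a minimal, self-contained sketch of the idioms they rely on (the paths are made up for illustration):

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() returns only the last path component, with no directory part
        Path file = new Path("/user/hive/warehouse/events/part-00000");
        System.out.println(file.getName()); // prints: part-00000

        // works the same for directories and for fully qualified URIs
        Path dir = new Path("hdfs://namenode:8020/tmp/jobtracker");
        System.out.println(dir.getName()); // prints: jobtracker

        // a common idiom in the examples below: filter paths by name prefix/suffix
        if (file.getName().endsWith(".crc")) {
            System.out.println("checksum file, skipping");
        }
    }
}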
From source file:com.tripadvisor.hadoop.VerifyHdfsBackup.java
License:Apache License
/**
 * Method to go through the HDFS filesystem in a DFS to find all files.
 *
 * @param fs FileSystem object from HDFS
 * @param p Path in HDFS to look for files
 * @param sLocalPathRoot root of the local copy to verify against
 * @param maxDate newest date (seconds since epoch) for files to be backed up
 */
public void checkDir(FileSystem fs, Path p, String sLocalPathRoot, long maxDate) {
    FileStatus[] fStat;

    try {
        String sPath = p.toUri().getPath();

        // If this is a directory
        if (fs.getFileStatus(p).isDir()) {
            // ignore certain directories
            if ("dfstmp".equals(p.getName()) || "tmp".equals(p.getName())
                    || "jobtracker".equals(p.getName()) || sPath.startsWith("/mapred")
                    || "ops".equals(p.getName()) || p.getName().startsWith("_distcp_logs")) {
                return;
            }

            fStat = fs.listStatus(p);

            // Do a recursive call to all elements
            for (int i = 0; i < fStat.length; i++) {
                checkDir(fs, fStat[i].getPath(), sLocalPathRoot, maxDate);
            }
        } else {
            // If not a directory then we've found a file

            // ignore crc files
            if (p.getName().endsWith(".crc")) {
                return;
            }

            // ignore other files
            if (sPath.startsWith("/user/oozie/etl/workflows/")) {
                return;
            }

            // try to get the table name from the path. There are
            // various types of tables, from those replicated from
            // tripmonster to regular hive tables to partitioned
            // hive tables. We use table names to both exclude
            // some from the backup, and for the rest to dump out
            // the schema and partition name.
            if (m_ignoreTables != null && m_ignoreTables.doIgnoreFile(sPath)) {
                return;
            }

            // check the file
            FileStatus stat = fs.getFileStatus(p);

            // ignore files that are too new
            if ((stat.getModificationTime() / 1000) > maxDate) {
                System.out.println("IGNORING: " + sPath + " too new");
                return;
            }

            // warn about files that have a mismatching block
            // size. The checksum check will fail for them
            // anyways, so just catch it here.
            if (stat.getBlockSize() != N_BLOCK_SIZE) {
                System.out.println("ERROR: non-default block size ("
                        + (stat.getBlockSize() / (1024 * 1024))
                        + "M) would fail checksum: " + sPath);
                return;
            }

            // get HDFS checksum
            FileChecksum ck = fs.getFileChecksum(p);
            String sCk, sCkShort;
            if (ck == null) {
                sCk = sCkShort = "<null>";
            } else {
                sCk = ck.toString();
                sCkShort = sCk.replaceAll("^.*:", "");
            }

            System.out.println(sPath + " len=" + stat.getLen() + " " + stat.getOwner()
                    + "/" + stat.getGroup() + " checksum=" + sCk);

            // find the local file
            String sFsPath = sLocalPathRoot + p.toUri().getPath();
            File fLocal = new File(sFsPath);
            if (!fLocal.exists()) {
                Calendar cal = Calendar.getInstance();
                cal.setTimeInMillis(stat.getModificationTime());
                System.out.println("ERROR: file does not exist: " + sFsPath
                        + " hdfs-last-mtime=" + cal.getTime().toString());
                return;
            }
            if (!fLocal.isFile()) {
                System.out.println("ERROR: path is not a file: " + sFsPath);
                return;
            }
            if (stat.getLen() != fLocal.length()) {
                System.out.println("ERROR: length mismatch: " + sFsPath
                        + " hdfslen=" + stat.getLen() + " fslen=" + fLocal.length());
                return;
            }

            // get local fs checksum
            FileChecksum ckLocal = getLocalFileChecksum(sFsPath);
            if (ckLocal == null) {
                System.out.println("ERROR Failed to get checksum for local file " + sFsPath);
                return;
            }

            // compare checksums as a string, to strip the
            // algorithm name from the beginning
            String sCkLocal = ckLocal.toString();
            String sCkLocalShort = sCkLocal.replaceAll("^.*:", "");
            if (!sCkShort.equals(sCkLocalShort)) {
                System.out.println("ERROR: checksum mismatch: " + sFsPath
                        + "\nhdfs = " + sCk + "\nlocal= " + sCkLocal);
                return;
            }
        }
    } catch (IOException e) {
        System.out.println("ERROR: could not open " + p + ": " + e);
        // System.exit(1);
    }
}
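A hypothetical driver for the method above; the class construction, local mirror root, and one-hour cutoff are invented for illustration (the real tool wires these up from its command line):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.tripadvisor.hadoop.VerifyHdfsBackup;

public class VerifyDriver {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        // maxDate is compared against the modification time in seconds inside checkDir
        long maxDate = System.currentTimeMillis() / 1000L - 3600L;
        // Hypothetical: assumes VerifyHdfsBackup has an accessible no-arg constructor
        new VerifyHdfsBackup().checkDir(fs, new Path("/"), "/backup/hdfs", maxDate);
    }
}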
From source file:com.turn.camino.Camino.java
License:Open Source License
/**
 * Render and materialize path.
 *
 * Note: the Path here is Camino's own config model (it has getValue()),
 * not org.apache.hadoop.fs.Path.
 *
 * @param path path to render and materialize
 * @param renderer renderer
 * @param context context
 * @param fileSystem file system
 * @return path status
 * @throws InvalidNameException
 * @throws WrongTypeException
 * @throws RenderException
 * @throws IOException
 */
protected PathStatus renderAndMaterializePath(Path path, Renderer renderer, Context context,
        FileSystem fileSystem)
        throws InvalidNameException, WrongTypeException, RenderException, IOException {
    String name = renderName(path.getName(), renderer, context);
    String value = validation.requireType(renderer.render(path.getValue(), context), String.class,
            Message.prefix(String.format("Value of %s", path.getName())));
    TimeValue expectedCreationTime = null;
    if (path.getExpectedCreationTime() != null) {
        expectedCreationTime = validation.requireType(
                renderer.render(path.getExpectedCreationTime(), context), TimeValue.class,
                Message.prefix(String.format("Expected creation time %s must be a time value",
                        path.getExpectedCreationTime())));
    }
    return new PathStatus(name, value, path, materializePath(value, fileSystem), expectedCreationTime);
}
From source file:com.twitter.algebra.matrix.format.MapDir.java
License:Apache License
static int extractKeyFromTitle(Path path) {
    String name = path.getName(); // e.g. matrix-k-123
    name = name.replace("--", "-"); // TODO: there is a bug that inserts two '-'
    Scanner scanner = new Scanner(name);
    scanner.useDelimiter("-");
    scanner.next(); // matrix
    scanner.next(); // k
    int key = scanner.nextInt();
    scanner.close();
    return key;
}
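A quick hypothetical check of the parsing above (the path is invented; since extractKeyFromTitle is package-private, this snippet assumes it lives in the same package):

import org.apache.hadoop.fs.Path;

public class ExtractKeyDemo {
    public static void main(String[] args) {
        // getName() yields "matrix-k-123"; splitting on '-' gives "matrix", "k", "123"
        Path title = new Path("/tmp/matrices/matrix-k-123");
        int key = MapDir.extractKeyFromTitle(title);
        System.out.println(key); // prints: 123
    }
}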
From source file:com.twitter.algebra.matrix.multiply.MultiplicationDriver.java
License:Apache License
private void run(Configuration conf, Path atPath, Path bPath, Path output, int nRows, int nCols,
        int k, int nParts) throws Exception {
    log.info("reading At");
    DistributedRowMatrix distAt = new DistributedRowMatrix(atPath, getTempPath(), nRows, nCols);
    distAt.setConf(conf);

    log.info("reading B");
    DistributedRowMatrix distB = new DistributedRowMatrix(bPath, getTempPath(), nRows, k);
    distB.setConf(conf);

    log.info("Partitioning At");
    distAt = PartitionerJob.run(conf, distAt, nParts, atPath.getName() + "-partitioned" + nParts);
    log.info("Partitioning B");
    distB = PartitionerJob.run(conf, distB, nParts, bPath.getName() + "-partitioned" + nParts);

    log.info("Computing At x B");
    DistributedRowMatrix distXt = AtBOuterStaticMapsideJoinJob.run(conf, distAt, distB,
            atPath.getName() + "x" + bPath.getName() + "-SMJ");
}
From source file:com.twitter.algebra.matrix.multiply.PartitionerJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows,
        int partitions) throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(PartitionerJob.class);
    job.setJobName(PartitionerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setNumReduceTasks(partitions);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setMapperClass(IdMapper.class);
    job.setReducerClass(IdReducer.class);

    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.CombinerJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int aRows)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    // conf.setBoolean("mapreduce.output.compress", true);
    // conf.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    // conf.set("mapreduce.output.fileoutputformat.compress.codec", "com.hadoop.compression.lzo.LzoCodec");
    conf.setInt("dfs.replication", 20);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(CombinerJob.class);
    job.setJobName(CombinerJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "combiner");
    job.setNumReduceTasks(numReducers); // TODO: make it a parameter

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.setMapperClass(IdMapper.class);
    job.setReducerClass(MergeVectorsReducer.class);

    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, aRows);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.CompositeDMJ.java
License:Apache License
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath,
        int atCols, boolean aIsMapDir, String inMemCStr, int inMemCRows, int inMemCCols,
        float alpha1, float alpha2) throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MATRIXINMEMORY, inMemCStr);
    conf.setInt(MATRIXINMEMORYROWS, inMemCRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemCCols);
    conf.setFloat(ALPHA1, alpha1);
    conf.setFloat(ALPHA2, alpha2);

    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "compositedmj");

    conf.set(MAPDIRMATRIX, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(CompositeDMJ.class);
    job.setJobName(CompositeDMJ.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    matrixInputPaths = fs.makeQualified(matrixInputPaths);

    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    return job;
}
From source file:com.twitter.algebra.nmf.DistRndMatrixJob.java
License:Apache License
public void run(Configuration conf, Path inPath, Path matrixOutputPath, int numInputRows,
        int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.setInt(ROWS, numInputRows);
    conf.setInt(COLS, numInputCols);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(DistRndMatrixJob.class);
    job.setJobName(DistRndMatrixJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    FileSystem fs = FileSystem.get(inPath.toUri(), conf);
    inPath = fs.makeQualified(inPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, inPath);
    job.setInputFormatClass(TextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(NullWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "random");
    job.setNumReduceTasks(numReducers);
    job.setReducerClass(MyReducer.class);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.Edge2MapDirJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols, String name)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(INDEXNAME, name);
    conf.setInt(ROWS, numInputRows);
    conf.setInt(COLS, numInputCols);
    conf.set("mapreduce.input.keyvaluelinerecordreader.key.value.separator", "\t");

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "edge2matrix");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(Edge2MapDirJob.class);
    job.setJobName(Edge2MapDirJob.class.getSimpleName() + "-" + matrixOutputPath.getName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(KeyValueTextInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "edge2matrix");
    job.setNumReduceTasks(numReducers);

    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputRows);

    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.ErrDMJ.java
License:Apache License
public Job run(Configuration conf, Path xPath, Path matrixAInputPath, Path ytPath, Path outPath,
        int aRows, int ytRows, int ytCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MAPDIRMATRIXX, xPath.toString());
    conf.set(MAPDIRMATRIXYT, ytPath.toString());
    conf.setInt(YTROWS, ytRows);
    conf.setInt(YTCOLS, ytCols);

    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixAInputPath, "err");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ErrDMJ.class);
    job.setJobName(ErrDMJ.class.getSimpleName() + "-" + outPath.getName());

    matrixAInputPath = fs.makeQualified(matrixAInputPath);
    MultipleInputs.addInputPath(job, matrixAInputPath, SequenceFileInputFormat.class);
    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = 1;
    job.setNumReduceTasks(numReducers);
    job.setCombinerClass(SumVectorsReducer.class);
    job.setReducerClass(SumVectorsReducer.class);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}