List of usage examples for org.apache.hadoop.fs Path getName
public String getName()
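Path.getName() returns the final component of the path (the part after the last slash), with no scheme, authority, or parent directories. Before the longer examples, here is a minimal, self-contained sketch of that behavior; the class name and sample paths are made up for illustration, including the common idiom (used in several examples below) of skipping Hadoop bookkeeping files whose names start with an underscore.

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() keeps only the last path component, dropping scheme,
        // authority, and parent directories.
        Path file = new Path("hdfs://namenode:8020/user/data/part-r-00000");
        System.out.println(file.getName()); // part-r-00000

        // The same applies to relative paths and directories.
        Path jar = new Path("lib_12345/mylib.jar");
        System.out.println(jar.getName()); // mylib.jar

        // Common idiom in the examples below: skip bookkeeping outputs
        // such as _SUCCESS or _logs.
        Path marker = new Path("/user/data/_SUCCESS");
        System.out.println(marker.getName().startsWith("_")); // true
    }
}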
From source file:edu.ucsb.cs.partitioning.statistics.rsd.java
License:Apache License
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.out.println("Usage:<input directory of (longWritable,FeatureWeightArrayWritable)> <p-norm>");
        return;
    }
    Configuration conf = new Configuration();
    Path inputPath = new Path(args[0]);
    FileSystem hdfs = inputPath.getFileSystem(conf);
    int lineCount = 0, pnorm = Integer.parseInt(args[1]);
    ArrayList<Float> pnorms = null, norm1 = null;
    Reader reader = null;
    if (!hdfs.exists(inputPath) || hdfs.isFile(inputPath)) {
        System.out.println("\n Input doesn't exists or is not a directory!");
        return;
    }
    FileStatus[] files = setFiles(hdfs, inputPath);
    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_"))
            continue;
        System.out.println("Reading file " + inputPath.getName()); // remove
        reader = new SequenceFile.Reader(hdfs, inputPath, conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        pnorms = new ArrayList<Float>();
        while (reader.next(key, value)) {
            pnorms.add(value.getPNorm(pnorm));
            lineCount++;
        }
    }
    float pnormrstd = getRStd(pnorms);
    System.out.println("\nInput has " + lineCount + " records.\n" + pnorm + "-Norm %-RSD = " + (pnormrstd * 100));
    reader.close();
}
From source file:edu.ucsb.cs.preprocessing.sequence.SeqMapper.java
License:Apache License
public void readIdMappings(JobConf job, Path inputDir) {
    String strLine = null;
    try {
        FileSystem hdfs = FileSystem.get(job);
        if (!hdfs.exists(inputDir)) {
            throw new UnsupportedEncodingException(
                    "ERROR: " + inputDir.getName() + " doesn't exists in hdfs !");
        }
        FileStatus[] cachedFiles = hdfs.listStatus(inputDir);
        for (int i = 0; i < cachedFiles.length; i++) {
            Path pt = cachedFiles[i].getPath();
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(pt)));
            while ((strLine = br.readLine()) != null) {
                StringTokenizer tkz = new StringTokenizer(strLine, ":: ");
                String key = tkz.nextToken();
                String value = tkz.nextToken();
                serialToIdMap.put(key.replace(" ", ""), value.replace(" ", ""));
            }
            br.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.java
License:Apache License
@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TrecWebDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TrecWebDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = new Job(getConf(),
            TrecWebDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TrecWebDocnoMappingBuilder.class);
    job.setNumReduceTasks(1);

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}
From source file:edu.umn.cs.spatialHadoop.core.GridRecordWriter.java
License:Open Source License
/**
 * Close the given cell freeing all memory reserved by it.
 * Once a cell is closed, we should not write more data to it.
 * @param intermediateCellPath
 * @param finalCellPath
 * @param intermediateCellStream
 * @param masterFile
 * @param cellMbr
 * @param recordCount
 * @param cellSize
 * @throws IOException
 */
protected void closeCellBackground(final Path intermediateCellPath, final Path finalCellPath,
        final OutputStream intermediateCellStream, final OutputStream masterFile, final CellInfo cellMbr,
        final long recordCount, final long cellSize) throws IOException {
    Thread closingThread = new Thread() {
        @Override
        public void run() {
            try {
                Path finalfinalCellPath = flushAllEntries(intermediateCellPath, intermediateCellStream,
                        finalCellPath);
                // Write a line to the master file including the file name and cellInfo
                if (masterFile != null) {
                    Partition partition = new Partition(finalfinalCellPath.getName(), cellMbr);
                    partition.recordCount = recordCount;
                    partition.size = cellSize;
                    Text line = partition.toText(new Text());
                    masterFile.write(line.getBytes(), 0, line.getLength());
                    masterFile.write(NEW_LINE);
                }
            } catch (IOException e) {
                throw new RuntimeException("Error closing thread", e);
            }
        }
    };

    closingThreads.add(closingThread);
    // Remove previously terminated threads
    while (!closingThreads.isEmpty() && closingThreads.get(0).getState() == Thread.State.TERMINATED) {
        closingThreads.remove(0);
    }
    // Start first thread (if exists)
    if (!closingThreads.isEmpty() && closingThreads.get(0).getState() == Thread.State.NEW)
        closingThreads.get(0).start();
}
From source file:edu.umn.cs.spatialHadoop.core.Partitioner.java
License:Open Source License
/**
 * Sets the class and value of a partitioner in the given job
 * @param conf
 * @param partitioner
 * @throws IOException
 */
public static void setPartitioner(Configuration conf, Partitioner partitioner) throws IOException {
    conf.setClass(PartitionerClass, partitioner.getClass(), Partitioner.class);
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".partitions");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    partitioner.write(out);
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(PartitionerValue, tempFile.getName());
}
From source file:edu.umn.cs.spatialHadoop.core.Partitioner.java
License:Open Source License
/**
 * Retrieves the value of a partitioner for a given job.
 * @param conf
 * @return
 */
public static Partitioner getPartitioner(Configuration conf) {
    Class<? extends Partitioner> klass = conf.getClass(PartitionerClass, Partitioner.class)
            .asSubclass(Partitioner.class);
    if (klass == null)
        return null;
    try {
        Partitioner partitioner = klass.newInstance();
        String partitionerFile = conf.get(PartitionerValue);
        if (partitionerFile != null) {
            Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
            for (Path cacheFile : cacheFiles) {
                if (cacheFile.getName().contains(partitionerFile)) {
                    FSDataInputStream in = FileSystem.getLocal(conf).open(cacheFile);
                    partitioner.readFields(in);
                    in.close();
                }
            }
        }
        return partitioner;
    } catch (InstantiationException e) {
        Log.warn("Error instantiating partitioner", e);
        return null;
    } catch (IllegalAccessException e) {
        Log.warn("Error instantiating partitioner", e);
        return null;
    } catch (IOException e) {
        Log.warn("Error retrieving partitioner value", e);
        return null;
    }
}
From source file:edu.umn.cs.spatialHadoop.core.RTreeGridRecordWriter.java
License:Open Source License
@Override
protected Path getFinalCellPath(int cellIndex) throws IOException {
    Path finalCellPath = super.getFinalCellPath(cellIndex);
    return new Path(finalCellPath.getParent(), finalCellPath.getName() + ".rtree");
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Ensures that the given class is in the class path of running jobs.
 * If the jar is not already in the class path, it is added to the
 * DistributedCache of the given job to ensure the associated job will work
 * fine.
 * @param conf
 * @param klass
 */
public static void addClassToPath(Configuration conf, Class<?> klass) {
    // Check if we need to add the containing jar to class path
    String klassJar = findContainingJar(klass);
    String shadoopJar = findContainingJar(SpatialSite.class);
    if (klassJar == null || (shadoopJar != null && klassJar.equals(shadoopJar)))
        return;
    Path containingJar = new Path(findContainingJar(klass));
    Path[] existingClassPaths = DistributedCache.getArchiveClassPaths(conf);
    if (existingClassPaths != null) {
        for (Path existingClassPath : existingClassPaths) {
            if (containingJar.getName().equals(existingClassPath.getName()))
                return;
        }
    }
    // The containing jar is a new one and needs to be copied to class path
    try {
        LOG.info("Adding JAR '" + containingJar.getName() + "' to job class path");
        FileSystem defaultFS = FileSystem.get(conf);
        Path libFolder;
        if (existingClassPaths != null && existingClassPaths.length > 0) {
            libFolder = existingClassPaths[0].getParent();
        } else {
            // First jar to be added like this. Create a new lib folder
            do {
                libFolder = new Path("lib_" + (int) (Math.random() * 100000));
            } while (defaultFS.exists(libFolder));
            defaultFS.mkdirs(libFolder);
            defaultFS.deleteOnExit(libFolder);
        }
        defaultFS.copyFromLocalFile(containingJar, libFolder);
        Path jarFullPath = new Path(libFolder, containingJar.getName()).makeQualified(defaultFS);
        jarFullPath = jarFullPath.makeQualified(defaultFS);
        DistributedCache.addArchiveToClassPath(jarFullPath, conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Set an array of cells in the job configuration. As the array might be
 * too large to store as one value, an alternative approach is used.
 * The cells are all written to a temporary file, and that file is added
 * to the DistributedCache of the job. Later on, a call to
 * {@link #getCells(Configuration)} will open the corresponding file from
 * DistributedCache and parse cells from that file.
 * @param conf
 * @param cellsInfo
 * @throws IOException
 */
public static void setCells(Configuration conf, CellInfo[] cellsInfo) throws IOException {
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".cells");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    out.writeInt(cellsInfo.length);
    for (CellInfo cell : cellsInfo) {
        cell.write(out);
    }
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(OUTPUT_CELLS, tempFile.getName());
    LOG.info("Partitioning file into " + cellsInfo.length + " cells");
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Retrieves cells that were stored earlier using
 * {@link #setCells(Configuration, CellInfo[])}.
 * This function opens the corresponding file from DistributedCache
 * and parses cells from it.
 * @param conf
 * @return
 * @throws IOException
 */
public static CellInfo[] getCells(Configuration conf) throws IOException {
    CellInfo[] cells = null;
    String cells_file = conf.get(OUTPUT_CELLS);
    if (cells_file != null) {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
        for (Path cacheFile : cacheFiles) {
            if (cacheFile.getName().contains(cells_file)) {
                FSDataInputStream in = FileSystem.getLocal(conf).open(cacheFile);
                int cellCount = in.readInt();
                cells = new CellInfo[cellCount];
                for (int i = 0; i < cellCount; i++) {
                    cells[i] = new CellInfo();
                    cells[i].readFields(in);
                }
                in.close();
            }
        }
    }
    return cells;
}