List of usage examples for org.apache.hadoop.fs Path getName
public String getName()
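Path.getName() returns the final component of the path (the part after the last slash), with no scheme, authority, or parent directories. Before the longer examples, here is a minimal, self-contained sketch of that behavior; the class name and sample paths are made up for illustration, including the common idiom (used in several examples below) of skipping Hadoop bookkeeping files whose names start with an underscore.

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() keeps only the last path component, dropping scheme,
        // authority, and parent directories.
        Path file = new Path("hdfs://namenode:8020/user/data/part-r-00000");
        System.out.println(file.getName()); // part-r-00000

        // The same applies to relative paths and directories.
        Path jar = new Path("lib_12345/mylib.jar");
        System.out.println(jar.getName()); // mylib.jar

        // Common idiom in the examples below: skip bookkeeping outputs
        // such as _SUCCESS or _logs.
        Path marker = new Path("/user/data/_SUCCESS");
        System.out.println(marker.getName().startsWith("_")); // true
    }
}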
From source file:edu.ucsb.cs.partitioning.statistics.rsd.java
License:Apache License
public static void main(String[] args) throws IOException {
    if (args.length != 2) {
        System.out.println("Usage:<input directory of (longWritable,FeatureWeightArrayWritable)> <p-norm>");
        return;
    }
    Configuration conf = new Configuration();
    Path inputPath = new Path(args[0]);
    FileSystem hdfs = inputPath.getFileSystem(conf);
    int lineCount = 0, pnorm = Integer.parseInt(args[1]);
    ArrayList<Float> pnorms = null, norm1 = null;
    Reader reader = null;
    if (!hdfs.exists(inputPath) || hdfs.isFile(inputPath)) {
        System.out.println("\n Input doesn't exists or is not a directory!");
        return;
    }
    FileStatus[] files = setFiles(hdfs, inputPath);
    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_"))
            continue;
        System.out.println("Reading file " + inputPath.getName()); // remove
        reader = new SequenceFile.Reader(hdfs, inputPath, conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        pnorms = new ArrayList<Float>();
        while (reader.next(key, value)) {
            pnorms.add(value.getPNorm(pnorm));
            lineCount++;
        }
    }
    float pnormrstd = getRStd(pnorms);
    System.out.println("\nInput has " + lineCount + " records.\n" + pnorm + "-Norm %-RSD = " + (pnormrstd * 100));
    reader.close();
}
From source file:edu.ucsb.cs.preprocessing.sequence.SeqMapper.java
License:Apache License
public void readIdMappings(JobConf job, Path inputDir) {
    String strLine = null;
    try {
        FileSystem hdfs = FileSystem.get(job);
        if (!hdfs.exists(inputDir)) {
            throw new UnsupportedEncodingException(
                    "ERROR: " + inputDir.getName() + " doesn't exists in hdfs !");
        }
        FileStatus[] cachedFiles = hdfs.listStatus(inputDir);
        for (int i = 0; i < cachedFiles.length; i++) {
            Path pt = cachedFiles[i].getPath();
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(pt)));
            while ((strLine = br.readLine()) != null) {
                StringTokenizer tkz = new StringTokenizer(strLine, ":: ");
                String key = tkz.nextToken();
                String value = tkz.nextToken();
                serialToIdMap.put(key.replace(" ", ""), value.replace(" ", ""));
            }
            br.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:edu.umd.cloud9.collection.trecweb.TrecWebDocnoMappingBuilder.java
License:Apache License
@Override
public int run(String[] args) throws IOException {
    DocnoMapping.DefaultBuilderOptions options = DocnoMapping.BuilderUtils.parseDefaultOptions(args);
    if (options == null) {
        return -1;
    }

    // Temp directory.
    String tmpDir = "tmp-" + TrecWebDocnoMappingBuilder.class.getSimpleName() + "-" + random.nextInt(10000);

    LOG.info("Tool name: " + TrecWebDocnoMappingBuilder.class.getCanonicalName());
    LOG.info(" - input path: " + options.collection);
    LOG.info(" - output file: " + options.docnoMapping);

    Job job = new Job(getConf(),
            TrecWebDocnoMappingBuilder.class.getSimpleName() + ":" + options.collection);
    FileSystem fs = FileSystem.get(job.getConfiguration());

    job.setJarByClass(TrecWebDocnoMappingBuilder.class);
    job.setNumReduceTasks(1);

    PathFilter filter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return !path.getName().startsWith("_");
        }
    };

    // Note: Gov2 and Wt10g raw collections are organized into sub-directories.
    Path collectionPath = new Path(options.collection);
    for (FileStatus status : fs.listStatus(collectionPath, filter)) {
        if (status.isDirectory()) {
            for (FileStatus s : fs.listStatus(status.getPath(), filter)) {
                FileInputFormat.addInputPath(job, s.getPath());
            }
        } else {
            FileInputFormat.addInputPath(job, status.getPath());
        }
    }
    FileOutputFormat.setOutputPath(job, new Path(tmpDir));
    FileOutputFormat.setCompressOutput(job, false);

    job.setInputFormatClass(options.inputFormat);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already.
    fs.delete(new Path(tmpDir), true);

    try {
        job.waitForCompletion(true);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }

    writeMappingData(new Path(tmpDir + "/part-r-00000"), new Path(options.docnoMapping), fs);
    fs.delete(new Path(tmpDir), true);

    return 0;
}
From source file:edu.umn.cs.spatialHadoop.core.GridRecordWriter.java
License:Open Source License
/**
 * Close the given cell freeing all memory reserved by it.
 * Once a cell is closed, we should not write more data to it.
 * @param intermediateCellPath
 * @param finalCellPath
 * @param intermediateCellStream
 * @param masterFile
 * @param cellMbr
 * @param recordCount
 * @param cellSize
 * @throws IOException
 */
protected void closeCellBackground(final Path intermediateCellPath, final Path finalCellPath,
        final OutputStream intermediateCellStream, final OutputStream masterFile, final CellInfo cellMbr,
        final long recordCount, final long cellSize) throws IOException {
    Thread closingThread = new Thread() {
        @Override
        public void run() {
            try {
                Path finalfinalCellPath = flushAllEntries(intermediateCellPath, intermediateCellStream,
                        finalCellPath);
                // Write a line to the master file including the file name and cellInfo
                if (masterFile != null) {
                    Partition partition = new Partition(finalfinalCellPath.getName(), cellMbr);
                    partition.recordCount = recordCount;
                    partition.size = cellSize;
                    Text line = partition.toText(new Text());
                    masterFile.write(line.getBytes(), 0, line.getLength());
                    masterFile.write(NEW_LINE);
                }
            } catch (IOException e) {
                throw new RuntimeException("Error closing thread", e);
            }
        }
    };

    closingThreads.add(closingThread);
    // Remove previously terminated threads
    while (!closingThreads.isEmpty() && closingThreads.get(0).getState() == Thread.State.TERMINATED) {
        closingThreads.remove(0);
    }
    // Start first thread (if exists)
    if (!closingThreads.isEmpty() && closingThreads.get(0).getState() == Thread.State.NEW)
        closingThreads.get(0).start();
}
From source file:edu.umn.cs.spatialHadoop.core.Partitioner.java
License:Open Source License
/**
 * Sets the class and value of a partitioner in the given job
 * @param conf
 * @param partitioner
 * @throws IOException
 */
public static void setPartitioner(Configuration conf, Partitioner partitioner) throws IOException {
    conf.setClass(PartitionerClass, partitioner.getClass(), Partitioner.class);
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".partitions");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    partitioner.write(out);
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(PartitionerValue, tempFile.getName());
}
From source file:edu.umn.cs.spatialHadoop.core.Partitioner.java
License:Open Source License
/**
 * Retrieves the value of a partitioner for a given job.
 * @param conf
 * @return
 */
public static Partitioner getPartitioner(Configuration conf) {
    Class<? extends Partitioner> klass = conf.getClass(PartitionerClass, Partitioner.class)
            .asSubclass(Partitioner.class);
    if (klass == null)
        return null;
    try {
        Partitioner partitioner = klass.newInstance();
        String partitionerFile = conf.get(PartitionerValue);
        if (partitionerFile != null) {
            Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
            for (Path cacheFile : cacheFiles) {
                if (cacheFile.getName().contains(partitionerFile)) {
                    FSDataInputStream in = FileSystem.getLocal(conf).open(cacheFile);
                    partitioner.readFields(in);
                    in.close();
                }
            }
        }
        return partitioner;
    } catch (InstantiationException e) {
        Log.warn("Error instantiating partitioner", e);
        return null;
    } catch (IllegalAccessException e) {
        Log.warn("Error instantiating partitioner", e);
        return null;
    } catch (IOException e) {
        Log.warn("Error retrieving partitioner value", e);
        return null;
    }
}
From source file:edu.umn.cs.spatialHadoop.core.RTreeGridRecordWriter.java
License:Open Source License
@Override
protected Path getFinalCellPath(int cellIndex) throws IOException {
    Path finalCellPath = super.getFinalCellPath(cellIndex);
    return new Path(finalCellPath.getParent(), finalCellPath.getName() + ".rtree");
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Ensures that the given class is in the class path of running jobs.
 * If the jar is not already in the class path, it is added to the
 * DistributedCache of the given job to ensure the associated job will work
 * fine.
 * @param conf
 * @param klass
 */
public static void addClassToPath(Configuration conf, Class<?> klass) {
    // Check if we need to add the containing jar to class path
    String klassJar = findContainingJar(klass);
    String shadoopJar = findContainingJar(SpatialSite.class);
    if (klassJar == null || (shadoopJar != null && klassJar.equals(shadoopJar)))
        return;
    Path containingJar = new Path(findContainingJar(klass));
    Path[] existingClassPaths = DistributedCache.getArchiveClassPaths(conf);
    if (existingClassPaths != null) {
        for (Path existingClassPath : existingClassPaths) {
            if (containingJar.getName().equals(existingClassPath.getName()))
                return;
        }
    }
    // The containing jar is a new one and needs to be copied to class path
    try {
        LOG.info("Adding JAR '" + containingJar.getName() + "' to job class path");
        FileSystem defaultFS = FileSystem.get(conf);
        Path libFolder;
        if (existingClassPaths != null && existingClassPaths.length > 0) {
            libFolder = existingClassPaths[0].getParent();
        } else {
            // First jar to be added like this. Create a new lib folder
            do {
                libFolder = new Path("lib_" + (int) (Math.random() * 100000));
            } while (defaultFS.exists(libFolder));
            defaultFS.mkdirs(libFolder);
            defaultFS.deleteOnExit(libFolder);
        }
        defaultFS.copyFromLocalFile(containingJar, libFolder);
        Path jarFullPath = new Path(libFolder, containingJar.getName()).makeQualified(defaultFS);
        jarFullPath = jarFullPath.makeQualified(defaultFS);
        DistributedCache.addArchiveToClassPath(jarFullPath, conf);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Set an array of cells in the job configuration. As the array might be
 * too large to store as one value, an alternative approach is used.
 * The cells are all written to a temporary file, and that file is added
 * to the DistributedCache of the job. Later on, a call to
 * {@link #getCells(Configuration)} will open the corresponding file from
 * DistributedCache and parse cells from that file.
 * @param conf
 * @param cellsInfo
 * @throws IOException
 */
public static void setCells(Configuration conf, CellInfo[] cellsInfo) throws IOException {
    Path tempFile;
    FileSystem fs = FileSystem.get(conf);
    do {
        tempFile = new Path("cells_" + (int) (Math.random() * 1000000) + ".cells");
    } while (fs.exists(tempFile));
    FSDataOutputStream out = fs.create(tempFile);
    out.writeInt(cellsInfo.length);
    for (CellInfo cell : cellsInfo) {
        cell.write(out);
    }
    out.close();

    fs.deleteOnExit(tempFile);

    DistributedCache.addCacheFile(tempFile.toUri(), conf);
    conf.set(OUTPUT_CELLS, tempFile.getName());
    LOG.info("Partitioning file into " + cellsInfo.length + " cells");
}
From source file:edu.umn.cs.spatialHadoop.core.SpatialSite.java
License:Open Source License
/**
 * Retrieves cells that were stored earlier using
 * {@link #setCells(Configuration, CellInfo[])}.
 * This function opens the corresponding file from DistributedCache
 * and parses cells from it.
 * @param conf
 * @return
 * @throws IOException
 */
public static CellInfo[] getCells(Configuration conf) throws IOException {
    CellInfo[] cells = null;
    String cells_file = conf.get(OUTPUT_CELLS);
    if (cells_file != null) {
        Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
        for (Path cacheFile : cacheFiles) {
            if (cacheFile.getName().contains(cells_file)) {
                FSDataInputStream in = FileSystem.getLocal(conf).open(cacheFile);
                int cellCount = in.readInt();
                cells = new CellInfo[cellCount];
                for (int i = 0; i < cellCount; i++) {
                    cells[i] = new CellInfo();
                    cells[i].readFields(in);
                }
                in.close();
            }
        }
    }
    return cells;
}