Example usage for org.apache.hadoop.fs Path getName

Introduction

This page collects usage examples for org.apache.hadoop.fs.Path#getName from open-source projects.

Prototype

public String getName() 

Document

Returns the final component of this path.
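
A minimal sketch of what getName returns (the path below is illustrative, not taken from the examples that follow):

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        Path p = new Path("hdfs://namenode:8020/user/data/part-00000");
        // getName() returns only the final component, regardless of scheme or parent directories.
        System.out.println(p.getName());   // part-00000
        System.out.println(p.getParent()); // hdfs://namenode:8020/user/data
    }
}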

Usage

From source file:edu.ucsb.cs.hybrid.mappers.IDMapper.java

License:Apache License

public void readIdMappings(JobConf job, Path inputDir) {
    String strLine = null;
    try {
        FileSystem hdfs = FileSystem.get(job);
        if (!hdfs.exists(inputDir)) {
            // A missing input directory is a file-not-found condition, not an encoding problem.
            throw new FileNotFoundException(
                    "ERROR: " + inputDir.getName() + " does not exist in HDFS!");
        }
        FileStatus[] cachedFiles = hdfs.listStatus(inputDir);
        for (int i = 0; i < cachedFiles.length; i++) {
            Path pt = cachedFiles[i].getPath();
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(pt)));
            while ((strLine = br.readLine()) != null) {
                StringTokenizer tkz = new StringTokenizer(strLine, ": ");
                String key = tkz.nextToken();
                String value = tkz.nextToken();
                md5ToIdMap.put(key.replace(" ", ""), value.replace(" ", ""));
            }
            br.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
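
Note that getName() yields only the final path component, so the error message above reports just the directory's name; inputDir.toString() would include the full path.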

From source file:edu.ucsb.cs.lsh.projection.SignaturesGenerator.java

License:Apache License

public static void prepareDistributedCache(JobConf job, FileSystem fs, Path path)
        throws URISyntaxException, IOException {
    FileStatus[] files = fs.listStatus(path);
    System.out.println("path to read from is: " + path.getName()); // remove
    for (FileStatus file : files)
        if (fs.isFile(file.getPath()) && !file.getPath().getName().startsWith("_"))
            DistributedCache.addCacheFile(file.getPath().toUri(), job);
}
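
The startsWith("_") check on getName() skips Hadoop bookkeeping outputs such as _SUCCESS and _logs, which jobs write alongside their data files. A sketch of the same filter pushed into the listing itself, via Hadoop's PathFilter (the class and method names here are illustrative):

import java.io.IOException;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

public class VisibleFiles {
    // List only visible data files, skipping names that start with "_" or ".".
    static FileStatus[] listDataFiles(FileSystem fs, Path dir) throws IOException {
        return fs.listStatus(dir, new PathFilter() {
            public boolean accept(Path p) {
                String name = p.getName();
                return !name.startsWith("_") && !name.startsWith(".");
            }
        });
    }
}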

From source file:edu.ucsb.cs.partitioning.cosine.CosinePartitioning.java

License:Apache License

/**
 * Job4: Rewrites and merges unnecessary partitioning.
 */
public static void rewritePartitions(JobConf job) throws IOException {
    Path outputPath = new Path(PartDriver.OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    System.out.println(JobSubmitter.stars() + "\n Running Organizer to remove unnecessary partitions --> "
            + outputPath.getName());
    Organizer.main(interPath, outputPath.getName(), job);
    FileSystem.get(job).delete(interPath, true);
}

From source file:edu.ucsb.cs.partitioning.cosine.Organizer.java

License:Apache License

public static void readCombineCopy(Path input, String output, JobConf job) throws IOException {
    boolean printDist = job.getBoolean(Config.PRINT_DISTRIBUTION_PROPERTY, Config.PRINT_DISTRIBUTION_VALUE);
    BufferedWriter distout = null;
    SequenceFile.Writer out = null;
    if (printDist)
        distout = new BufferedWriter(new FileWriter("p-norm-distribution" + output));

    int pc = 0, pr = 0;
    float pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    FileSystem hdfs = input.getFileSystem(job);
    FileStatus[] files = Partitioner.setFiles(hdfs, input);
    ArrayList<String> partitions = arrangeNames(files);

    for (int i = 0; i < partitions.size(); i++) {
        Path inputPath = new Path(input.toString() + "/" + partitions.get(i));
        if (hdfs.isDirectory(inputPath))
            continue;

        SequenceFile.Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
        if (!isCombined(pr, pc, getRow(inputPath.getName()), getCol(inputPath.getName()), partitions)) {
            if (out != null)
                out.close();
            pr = getRow(inputPath.getName());
            pc = getCol(inputPath.getName());
            out = SequenceFile.createWriter(hdfs, job, new Path(output + "/" + inputPath.getName()),
                    LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
        }
        while (in.next(unused, document)) {
            out.append(new LongWritable(document.id),
                    new FeatureWeightArrayWritable(document.vectorSize, document.vector));
            if (printDist)
                distout.write(document.getPNorm(pChoice) + " \n");
        }
        in.close();
    }
    if (out != null)
        out.close();
    if (printDist)
        distout.close(); // flush the buffered p-norm distribution output
}
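
Here getName() does double duty: getRow and getCol parse the partition's row and column out of the bare file name, and the same name is reused to create the matching output file under output.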

From source file:edu.ucsb.cs.partitioning.cosine.Partitioner.java

License:Apache License

/**
 * Statically partitions the input documents into at most nPartitions
 * partitions, recording per-partition maxima as it goes.
 *
 * @param job job configuration
 * @param inputDir directory holding the input document vectors
 * @param interDir destination directory for the produced partitions
 * @param maxDir file recording the maximum p-norm, weight, and vector size per partition
 * @param nPartitions requested number of partitions
 * @param norm_weight_all flag selecting which per-partition maxima are written
 * @return number of partitions actually produced
 */
public static int produceStaticParitions(JobConf job, String inputDir, String interDir, String maxDir,
        int nPartitions, int norm_weight_all) {
    SequenceFile.Writer partOut = null;
    float maxn = 0, maxw = 0, pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    int maxs = 0;
    LongWritable prevK = null, key = new LongWritable();
    FeatureWeightArrayWritable prevV = null, value = new FeatureWeightArrayWritable();

    try {
        Path inputPath = new Path(inputDir);
        FileSystem hdfs = inputPath.getFileSystem(new Configuration());
        Path interDirectory = new Path(interDir);
        Path maxPath = new Path(maxDir);

        clearPath(hdfs, maxPath);
        clearPath(hdfs, interDirectory);

        long nDocuments = Collector.countDirVectors(hdfs, inputPath, job);
        if (nDocuments == 0)
            return 0;

        double partitionSize;
        uniformPartitions = job.getBoolean(Config.UNIFORM_PARTITIONING_PROPERTY,
                Config.UNIFORM_PARTITIONING_VALUE);
        if (uniformPartitions)
            partitionSize = Math.ceil(nDocuments / (double) nPartitions);
        else
            partitionSize = Math.ceil(nDocuments / (double) (GijComparisons.choose(nPartitions + 1, 2)));

        if (partitionSize == 1)
            System.err.println("WARN: Number of partitions = number of documents!!");

        FileStatus[] files = setFiles(hdfs, inputPath);
        FSDataOutputStream maxOut = hdfs.create(maxPath);

        int documentNo = 0, partitionNo = 1; // partition naming starts at 1
        for (int i = 0; i < files.length; i++) {
            inputPath = files[i].getPath();
            if ((hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_")))
                continue;
            Reader in = new SequenceFile.Reader(hdfs, inputPath, job);

            while (in.next(key, value)) { // id,vector
                documentNo++;
                prevK = key;
                prevV = value;

                if (isFirstDocument(partOut)) {
                    maxn = value.getPNorm(pChoice);
                    maxw = value.getMaxWeight();
                    maxs = value.vectorSize;
                    partOut = openFile(hdfs, job, interDirectory, partitionNo);
                }
                partOut.append(key, value);
                maxw = (value.getMaxWeight() > maxw) ? value.getMaxWeight() : maxw;
                maxs = (value.vectorSize > maxs) ? value.vectorSize : maxs;
                maxn = (value.getPNorm(pChoice) > maxn) ? value.getPNorm(pChoice) : maxn;

                if (isLastDocument(documentNo, partitionNo, partitionSize, uniformPartitions)) {
                    partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
                    documentNo = 0;
                    partitionNo++;
                }
            }
            in.close();
        }
        if (partOut != null)
            partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
        nPartitions = partitionNo - 1;
        maxOut.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return (nPartitions);
}

From source file:edu.ucsb.cs.partitioning.statistics.Collector.java

License:Apache License

public static FileStatus[] getFiles(Path inputPath, FileSystem fs) throws IOException {

    FileStatus[] files = null;
    if (fs.exists(inputPath)) {
        if (fs.isFile(inputPath)) {
            files = new FileStatus[1];
            files[0] = new FileStatus(0, false, 1, 1, 1, inputPath);
        } else
            files = fs.listStatus(inputPath);
    } else
        throw new FileNotFoundException("Error: " + inputPath.getName() + " does not exist.");
    return files;
}

From source file:edu.ucsb.cs.partitioning.statistics.Collector.java

License:Apache License

public static String getNumMinMaxAvgVecLengthAvgPosting(FileSystem fs, Path inputPath, JobConf job)
        throws IOException {

    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    long numDocuments = 0, minDocLength = Long.MAX_VALUE, maxDocLength = 0;
    double avgDocLength = 0;
    int partitionSize; // remove

    HashMap<Long, Float> partitionfeaturesWeight = new HashMap<Long, Float>();

    Iterator<Path> pathItr = getSortedFiles(inputPath, fs);
    if (!pathItr.hasNext())
        return "0,0,0,0";

    while (pathItr.hasNext()) {
        inputPath = pathItr.next();
        SequenceFile.Reader in = new SequenceFile.Reader(fs, inputPath, job);
        partitionSize = 0;// remove
        while (in.next(key, value)) {
            partitionSize++;// remove
            numDocuments++;
            avgDocLength += value.vectorSize;
            if (minDocLength > value.vectorSize)
                minDocLength = value.vectorSize;
            if (maxDocLength < value.vectorSize)
                maxDocLength = value.vectorSize;

            for (int j = 0; j < value.vectorSize; j++) {
                FeatureWeight current = value.vector[j];
                updatePartitionBaraglia(partitionfeaturesWeight, current);
            }
        }
        System.out.println(inputPath.getName() + " has " + partitionSize + " vectors."); // remove
        partitionsWriter.append(new Text(inputPath.getName()), new LongWritable(partitionSize));
        in.close();
        writePartitionBaraglia(inputPath.getName(), partitionfeaturesWeight);
    }
    partitionsWriter.close();
    maxWeightVector.clear();
    String nFeaturesAvgPost = getNFeaturesAvgPosting(fs, inputPath.getParent(), job);
    avgDocLength = avgDocLength / numDocuments;

    return numDocuments + " , " + minDocLength + " , " + maxDocLength + " ," + avgDocLength + " ,"
            + nFeaturesAvgPost;
}
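
Because inputPath is reassigned inside the loop, it ends up pointing at the last file read; getParent() then recovers the enclosing directory for the follow-up getNFeaturesAvgPosting scan, complementing the getName() calls above.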

From source file:edu.ucsb.cs.partitioning.statistics.Collector.java

License:Apache License

public static long countFileVectors(FileSystem fs, Path inputFile, JobConf job) throws IOException {
    long nDocuments = 0;
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

    if ((fs.isDirectory(inputFile)) || inputFile.getName().startsWith("_"))
        return 0;
    SequenceFile.Reader in = new SequenceFile.Reader(fs, inputFile, job);
    while (in.next(key, value))
        nDocuments++;
    in.close();
    return nDocuments;
}

From source file:edu.ucsb.cs.partitioning.statistics.CollectorBaraglia.java

License:Apache License

public static String getNumMinMaxAvgVecLengthAvgPosting(FileSystem fs, Path inputPath, JobConf job)
        throws IOException {

    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    long numDocuments = 0, minDocLength = Long.MAX_VALUE, maxDocLength = 0, numWords = 0;
    double avgDocLength = 0;

    HashMap<Long, Float> partitionfeaturesWeight = new HashMap<Long, Float>();

    Iterator<Path> pathItr = getSortedFiles(inputPath, fs);
    if (!pathItr.hasNext())
        return "0,0,0,0";

    while (pathItr.hasNext()) {
        inputPath = pathItr.next();
        SequenceFile.Reader in = new SequenceFile.Reader(fs, inputPath, job);
        while (in.next(key, value)) {
            numDocuments++;
            avgDocLength += value.vectorSize;
            if (minDocLength > value.vectorSize)
                minDocLength = value.vectorSize;
            if (maxDocLength < value.vectorSize)
                maxDocLength = value.vectorSize;
            for (int j = 0; j < value.vectorSize; j++) {
                FeatureWeight current = value.vector[j];
                updatePartitionBaraglia(partitionfeaturesWeight, current);
            }
        }
        in.close();
        writePartitionBaraglia(inputPath.getName(), partitionfeaturesWeight);
    }
    long nFeatures = globalFeaturesCount.keySet().size();
    long sumPostings = getSumPostingLength(globalFeaturesCount);
    avgDocLength = avgDocLength / numDocuments;
    return numDocuments + " , " + minDocLength + " , " + maxDocLength + " ," + avgDocLength + " ," + nFeatures
            + " ," + (float) sumPostings / nFeatures;
}

From source file:edu.ucsb.cs.partitioning.statistics.DistributionPlotter.java

License:Apache License

public static void main(String[] args) throws IOException {

    if (args.length != 4)
        printUsage();

    input = args[0];
    output = args[1];
    range = Float.parseFloat(args[2]);
    p = Float.parseFloat(args[3]);

    Configuration conf = new Configuration();
    Path inputPath = new Path(input);
    FileSystem hdfs = inputPath.getFileSystem(conf);
    int lineCount = 0;
    double avg = 0, variance = 0;
    ArrayList<Float> pnorms = new ArrayList<Float>();
    Reader reader = null;

    if ((!hdfs.exists(inputPath)) || (!hdfs.isDirectory(inputPath)))
        printUsage();

    FileStatus[] files = setFiles(hdfs, inputPath);
    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_"))
            continue;
        System.out.println("Reading file " + inputPath.getName()); // remove
        reader = new SequenceFile.Reader(hdfs, inputPath, conf);

        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();

        while (reader.next(key, value)) {
            float x = value.getPNorm(p);
            avg += x;
            pnorms.add(x);
            int pNorm = findRange(x);
            if (max < pNorm)
                max = pNorm;
            int bar = pNorm;
            if (historgram.containsKey(bar))
                historgram.put(bar, historgram.get(bar) + 1);
            else
                historgram.put(bar, 1);
            lineCount++;
        }
        reader.close();
    }
    avg /= lineCount;
    for (int i = 0; i < pnorms.size(); i++)
        variance += Math.pow(pnorms.get(i) - avg, 2);
    variance /= (lineCount - 1);
    writeHistorgramToFile(output, avg, variance);
    System.out.println(lineCount + " vectors are processed. ");
}