List of usage examples for org.apache.hadoop.fs.Path.getName()
public String getName()
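getName() returns the final component of the path, i.e. the file or directory name with the scheme, authority, and parent directories stripped. The snippet below is a minimal illustrative sketch, not taken from the source files listed here; the host name and path string are invented for the example.

import org.apache.hadoop.fs.Path;

public class PathGetNameExample {
    public static void main(String[] args) {
        Path p = new Path("hdfs://namenode:8020/user/data/part-00000");
        // Only the last path component is returned; scheme, authority,
        // and parent directories are dropped.
        System.out.println(p.getName());             // part-00000
        System.out.println(p.getParent().getName()); // data
    }
}
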
From source file: edu.ucsb.cs.hybrid.mappers.IDMapper.java
License: Apache License

public void readIdMappings(JobConf job, Path inputDir) {
    String strLine = null;
    try {
        FileSystem hdfs = FileSystem.get(job);
        if (!hdfs.exists(inputDir)) {
            throw new UnsupportedEncodingException(
                    "ERROR: " + inputDir.getName() + " does not exist in HDFS!");
        }
        FileStatus[] cachedFiles = hdfs.listStatus(inputDir);
        for (int i = 0; i < cachedFiles.length; i++) {
            Path pt = cachedFiles[i].getPath();
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(pt)));
            while ((strLine = br.readLine()) != null) {
                StringTokenizer tkz = new StringTokenizer(strLine, ": ");
                String key = tkz.nextToken();
                String value = tkz.nextToken();
                md5ToIdMap.put(key.replace(" ", ""), value.replace(" ", ""));
            }
            br.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}

From source file: edu.ucsb.cs.lsh.projection.SignaturesGenerator.java
License: Apache License

public static void prepareDistributedCache(JobConf job, FileSystem fs, Path path)
        throws URISyntaxException, IOException {
    FileStatus[] files = fs.listStatus(path);
    System.out.println("path to read from is: " + path.getName()); // remove
    for (FileStatus file : files)
        if (fs.isFile(file.getPath()) && !file.getPath().getName().startsWith("_"))
            DistributedCache.addCacheFile(file.getPath().toUri(), job);
}

From source file: edu.ucsb.cs.partitioning.cosine.CosinePartitioning.java
License: Apache License

/**
 * Job4: Rewrites and merges unnecessary partitioning.
 */
public static void rewritePartitions(JobConf job) throws IOException {
    Path outputPath = new Path(PartDriver.OUTPUT_DIR);
    FileSystem.get(job).delete(outputPath, true);
    System.out.println(JobSubmitter.stars()
            + "\n Running Organizer to remove unnecessary partitioning --> " + outputPath.getName());
    Organizer.main(interPath, outputPath.getName(), job);
    FileSystem.get(job).delete(interPath, true);
}

From source file: edu.ucsb.cs.partitioning.cosine.Organizer.java
License: Apache License

public static void readCombineCopy(Path input, String output, JobConf job) throws IOException {
    boolean printDist = job.getBoolean(Config.PRINT_DISTRIBUTION_PROPERTY, Config.PRINT_DISTRIBUTION_VALUE);
    BufferedWriter distout = null;
    SequenceFile.Writer out = null;
    if (printDist)
        distout = new BufferedWriter(new FileWriter("p-norm-distribution" + output));
    int pc = 0, pr = 0;
    float pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    FileSystem hdfs = input.getFileSystem(new JobConf());
    FileStatus[] files = Partitioner.setFiles(hdfs, input);
    ArrayList<String> partitions = arrangeNames(files);
    for (int i = 0; i < partitions.size(); i++) {
        Path inputPath = new Path(input.toString() + "/" + partitions.get(i));
        if (hdfs.isDirectory(inputPath))
            continue;
        SequenceFile.Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
        if (!isCombined(pr, pc, getRow(inputPath.getName()), getCol(inputPath.getName()), partitions)) {
            if (out != null)
                out.close();
            pr = getRow(inputPath.getName());
            pc = getCol(inputPath.getName());
            out = SequenceFile.createWriter(hdfs, job, new Path(output + "/" + inputPath.getName()),
                    LongWritable.class, FeatureWeightArrayWritable.class, SequenceFile.CompressionType.NONE);
        }
        while (in.next(unused, document)) {
            out.append(new LongWritable(document.id),
                    new FeatureWeightArrayWritable(document.vectorSize, document.vector));
            if (printDist)
                distout.write(document.getPNorm(pChoice) + " \n");
        }
        in.close();
    }
    if (out != null)
        out.close();
}

From source file: edu.ucsb.cs.partitioning.cosine.Partitioner.java
License: Apache License

/**
 * @param job
 * @param inputDir
 * @param interDir
 * @param maxDir
 * @param nPartitions
 * @param norm_weight_all
 * @return number of partitions actually produced
 */
public static int produceStaticParitions(JobConf job, String inputDir, String interDir, String maxDir,
        int nPartitions, int norm_weight_all) {
    SequenceFile.Writer partOut = null;
    float maxn = 0, maxw = 0, pChoice = job.getFloat(NormSortMain.P_NORM_PROPERTY, NormSortMain.P_NORM_VALUE);
    int maxs = 0;
    LongWritable prevK = null, key = new LongWritable();
    FeatureWeightArrayWritable prevV = null, value = new FeatureWeightArrayWritable();
    try {
        Path inputPath = new Path(inputDir);
        FileSystem hdfs = inputPath.getFileSystem(new Configuration());
        Path interDirectory = new Path(interDir);
        Path maxPath = new Path(maxDir);
        clearPath(hdfs, maxPath);
        clearPath(hdfs, interDirectory);
        long nDocuments = Collector.countDirVectors(hdfs, inputPath, job);
        if (nDocuments == 0)
            return 0;
        double partitionSize;
        uniformPartitions = job.getBoolean(Config.UNIFORM_PARTITIONING_PROPERTY, Config.UNIFORM_PARTITIONING_VALUE);
        if (uniformPartitions)
            partitionSize = Math.ceil(nDocuments / (double) nPartitions);
        else
            partitionSize = Math.ceil(nDocuments / (double) (GijComparisons.choose(nPartitions + 1, 2)));
        if (partitionSize == 1)
            System.err.println("WARN: Number of partitions = number of documents!!");
        FileStatus[] files = setFiles(hdfs, inputPath);
        FSDataOutputStream maxOut = hdfs.create(maxPath);
        int documentNo = 0, partitionNo = 1; // partition naming starts at 1
        for (int i = 0; i < files.length; i++) {
            inputPath = files[i].getPath();
            if ((hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_")))
                continue;
            Reader in = new SequenceFile.Reader(hdfs, inputPath, job);
            while (in.next(key, value)) { // id,vector
                documentNo++;
                prevK = key;
                prevV = value;
                if (isFirstDocument(partOut)) {
                    maxn = value.getPNorm(pChoice);
                    maxw = value.getMaxWeight();
                    maxs = value.vectorSize;
                    partOut = openFile(hdfs, job, interDirectory, partitionNo);
                }
                partOut.append(key, value);
                maxw = (value.getMaxWeight() > maxw) ? value.getMaxWeight() : maxw;
                maxs = (value.vectorSize > maxs) ? value.vectorSize : maxs;
                maxn = (value.getPNorm(pChoice) > maxn) ? value.getPNorm(pChoice) : maxn;
                if (isLastDocument(documentNo, partitionNo, partitionSize, uniformPartitions)) {
                    partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
                    documentNo = 0;
                    partitionNo++;
                }
                prevK = key;
                prevV = value;
            }
            in.close();
        }
        if (partOut != null)
            partOut = writeMax(norm_weight_all, partOut, maxOut, maxn, maxw, maxs);
        nPartitions = partitionNo - 1;
        maxOut.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return (nPartitions);
}

From source file: edu.ucsb.cs.partitioning.statistics.Collector.java
License: Apache License

public static FileStatus[] getFiles(Path inputPath, FileSystem fs) throws IOException {
    FileStatus[] files = null;
    if (fs.exists(inputPath)) {
        if (fs.isFile(inputPath)) {
            files = new FileStatus[1];
            files[0] = new FileStatus(0, false, 1, 1, 1, inputPath);
        } else
            files = fs.listStatus(inputPath);
    } else
        throw new FileNotFoundException("Error: " + inputPath.getName() + " does not exist.");
    return files;
}

From source file: edu.ucsb.cs.partitioning.statistics.Collector.java
License: Apache License

public static String getNumMinMaxAvgVecLengthAvgPosting(FileSystem fs, Path inputPath, JobConf job)
        throws IOException {
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    long numDocuments = 0, minDocLength = Long.MAX_VALUE, maxDocLength = 0;
    double avgDocLength = 0;
    int partitionSize; // remove
    HashMap<Long, Float> partitionfeaturesWeight = new HashMap<Long, Float>();
    Iterator<Path> pathItr = getSortedFiles(inputPath, fs);
    if (!pathItr.hasNext())
        return "0,0,0,0";
    while (pathItr.hasNext()) {
        inputPath = pathItr.next();
        SequenceFile.Reader in = new SequenceFile.Reader(fs, inputPath, job);
        partitionSize = 0; // remove
        while (in.next(key, value)) {
            partitionSize++; // remove
            numDocuments++;
            avgDocLength += value.vectorSize;
            if (minDocLength > value.vectorSize)
                minDocLength = value.vectorSize;
            if (maxDocLength < value.vectorSize)
                maxDocLength = value.vectorSize;
            for (int j = 0; j < value.vectorSize; j++) {
                FeatureWeight current = value.vector[j];
                updatePartitionBaraglia(partitionfeaturesWeight, current);
            }
        }
        System.out.println(inputPath.getName() + " has " + partitionSize + " vectors."); // remove
        partitionsWriter.append(new Text(inputPath.getName()), new LongWritable(partitionSize));
        in.close();
        writePartitionBaraglia(inputPath.getName(), partitionfeaturesWeight);
    }
    partitionsWriter.close();
    maxWeightVector.clear();
    String nFeaturesAvgPost = getNFeaturesAvgPosting(fs, inputPath.getParent(), job);
    avgDocLength = avgDocLength / numDocuments;
    return numDocuments + " , " + minDocLength + " , " + maxDocLength + " ," + avgDocLength + " ,"
            + nFeaturesAvgPost;
}

From source file: edu.ucsb.cs.partitioning.statistics.Collector.java
License: Apache License

public static long countFileVectors(FileSystem fs, Path inputFile, JobConf job) throws IOException {
    long nDocuments = 0;
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    if ((fs.isDirectory(inputFile)) || inputFile.getName().startsWith("_"))
        return 0;
    SequenceFile.Reader in = new SequenceFile.Reader(fs, inputFile, job);
    while (in.next(key, value))
        nDocuments++;
    in.close();
    return nDocuments;
}

From source file: edu.ucsb.cs.partitioning.statistics.CollectorBaraglia.java
License: Apache License

public static String getNumMinMaxAvgVecLengthAvgPosting(FileSystem fs, Path inputPath, JobConf job)
        throws IOException {
    LongWritable key = new LongWritable();
    FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
    long numDocuments = 0, minDocLength = Long.MAX_VALUE, maxDocLength = 0, numWords = 0;
    double avgDocLength = 0;
    HashMap<Long, Float> partitionfeaturesWeight = new HashMap<Long, Float>();
    Iterator<Path> pathItr = getSortedFiles(inputPath, fs);
    if (!pathItr.hasNext())
        return "0,0,0,0";
    while (pathItr.hasNext()) {
        inputPath = pathItr.next();
        SequenceFile.Reader in = new SequenceFile.Reader(fs, inputPath, job);
        while (in.next(key, value)) {
            numDocuments++;
            avgDocLength += value.vectorSize;
            if (minDocLength > value.vectorSize)
                minDocLength = value.vectorSize;
            if (maxDocLength < value.vectorSize)
                maxDocLength = value.vectorSize;
            for (int j = 0; j < value.vectorSize; j++) {
                FeatureWeight current = value.vector[j];
                updatePartitionBaraglia(partitionfeaturesWeight, current);
            }
        }
        in.close();
        writePartitionBaraglia(inputPath.getName(), partitionfeaturesWeight);
    }
    long nFeatures = globalFeaturesCount.keySet().size();
    long sumPostings = getSumPostingLength(globalFeaturesCount);
    avgDocLength = avgDocLength / numDocuments;
    return numDocuments + " , " + minDocLength + " , " + maxDocLength + " ," + avgDocLength + " ," + nFeatures
            + " ," + (float) sumPostings / nFeatures;
}

From source file: edu.ucsb.cs.partitioning.statistics.DistributionPlotter.java
License: Apache License

public static void main(String[] args) throws IOException {
    if (args.length != 4)
        printUsage();
    input = args[0];
    output = args[1];
    range = Float.parseFloat(args[2]);
    p = Float.parseFloat(args[3]);
    Configuration conf = new Configuration();
    Path inputPath = new Path(input);
    FileSystem hdfs = inputPath.getFileSystem(conf);
    int lineCount = 0;
    double avg = 0, variance = 0;
    ArrayList<Float> pnorms = new ArrayList<Float>();
    Reader reader = null;
    if ((!hdfs.exists(inputPath)) || (!hdfs.isDirectory(inputPath)))
        printUsage();
    FileStatus[] files = setFiles(hdfs, inputPath);
    for (int i = 0; i < files.length; i++) {
        inputPath = files[i].getPath();
        if (hdfs.isDirectory(inputPath) || inputPath.getName().startsWith("_"))
            continue;
        System.out.println("Reading file " + inputPath.getName()); // remove
        reader = new SequenceFile.Reader(hdfs, inputPath, conf);
        LongWritable key = new LongWritable();
        FeatureWeightArrayWritable value = new FeatureWeightArrayWritable();
        while (reader.next(key, value)) {
            float x = value.getPNorm(p);
            avg += x;
            pnorms.add(x);
            int pNorm = findRange(x);
            if (max < pNorm)
                max = pNorm;
            int bar = pNorm;
            if (historgram.containsKey(bar))
                historgram.put(bar, historgram.get(bar) + 1);
            else
                historgram.put(bar, 1);
            lineCount++;
        }
        reader.close();
    }
    avg /= lineCount;
    for (int i = 0; i < pnorms.size(); i++)
        variance += Math.pow(pnorms.get(i) - avg, 2);
    variance /= (lineCount - 1);
    writeHistorgramToFile(output, avg, variance);
    System.out.println(lineCount + " vectors are processed. ");
}