List of usage examples for org.apache.hadoop.fs.Path.getName()
public String getName()
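Path.getName() returns the final component of the path, with the scheme, authority, and parent directories stripped. Before the source-file examples, here is a minimal, self-contained sketch (the HDFS URI and file names are made up for illustration) showing the return value and the sibling-file pattern that several of the examples below rely on:

import org.apache.hadoop.fs.Path;

public class PathGetNameExample {
    public static void main(String[] args) {
        // getName() returns only the last path component, without scheme or parent dirs.
        Path p = new Path("hdfs://namenode:8020/user/data/part-00000.bson");
        System.out.println(p.getName());              // part-00000.bson
        System.out.println(p.getParent().getName());  // data

        // Common pattern from the examples below: build a hidden sibling file
        // next to the original by combining getParent() and getName().
        Path splits = new Path(p.getParent(), "." + p.getName() + ".splits");
        System.out.println(splits);  // hdfs://namenode:8020/user/data/.part-00000.bson.splits
    }
}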
From source file:com.mongodb.hadoop.hive.output.HiveBSONFileOutputFormat.java
License:Apache License
/**
 * Create the final output file.
 *
 * @param jc the job configuration
 * @param fileOutputPath the file that the output should be directed at
 * @param valueClass the value class used to create
 * @param tableProperties the tableInfo for this file's corresponding table
 * @return RecordWriter for the output file
 */
@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path fileOutputPath,
        final Class<? extends Writable> valueClass, final boolean isCompressed,
        final Properties tableProperties, final Progressable progress) throws IOException {
    LOG.info("Output going into " + fileOutputPath);

    FileSystem fs = fileOutputPath.getFileSystem(jc);
    FSDataOutputStream outFile = fs.create(fileOutputPath);

    FSDataOutputStream splitFile = null;
    if (MongoConfigUtil.getBSONOutputBuildSplits(jc)) {
        Path splitPath = new Path(fileOutputPath.getParent(), "." + fileOutputPath.getName() + ".splits");
        splitFile = fs.create(splitPath);
    }

    long splitSize = BSONSplitter.getSplitSize(jc, null);
    return new HiveBSONFileRecordWriter(outFile, splitFile, splitSize);
}
From source file:com.mongodb.hadoop.mapred.BSONFileOutputFormat.java
License:Apache License
public RecordWriter<K, V> getRecordWriter(final FileSystem ignored, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    Path outPath = getDefaultWorkFile(job, name, ".bson");
    LOG.info("output going into " + outPath);

    FileSystem fs = outPath.getFileSystem(job);
    FSDataOutputStream outFile = fs.create(outPath);

    FSDataOutputStream splitFile = null;
    if (MongoConfigUtil.getBSONOutputBuildSplits(job)) {
        Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits");
        splitFile = fs.create(splitPath);
    }

    long splitSize = BSONSplitter.getSplitSize(job, null);
    return new BSONFileRecordWriter<K, V>(outFile, splitFile, splitSize);
}
From source file:com.moz.fiji.mapreduce.DistributedCacheJars.java
License:Apache License
/**
 * Removes files whose names are duplicated in a given collection.
 *
 * @param jarFiles Collection of .jar files to de-duplicate.
 * @return De-duplicated collection of .jar files.
 */
public static List<Path> deDuplicateFilenames(Iterable<Path> jarFiles) {
    final Set<String> jarFileNames = Sets.newHashSet();
    final List<Path> uniqueFiles = Lists.newArrayList();
    for (Path jarFile : jarFiles) {
        if (jarFileNames.add(jarFile.getName())) {
            uniqueFiles.add(jarFile);
        }
    }
    return uniqueFiles;
}
From source file:com.moz.fiji.mapreduce.lib.examples.News20BulkImporter.java
License:Apache License
/**
 * Reads a single news article and writes its contents to a new fiji row,
 * indexed by the article's name (a string consisting of the parent folder and
 * this article's hash) and the a priori categorization of this article.
 *
 * @param key The fully qualified path to the current file we're reading.
 * @param value The raw data to insert into this column.
 * @param context The context to write to.
 * @throws IOException if there is an error.
 */
@Override
public void produce(Text key, Text value, FijiTableContext context) throws IOException {
    Path qualifiedPath = new Path(key.toString());

    // Category is specified on the containing folder.
    String category = qualifiedPath.getParent().getName();
    // Name is the concatenation of category and file name.
    String name = category + "." + qualifiedPath.getName();

    // Write name, category, and raw article.
    EntityId entity = context.getEntityId(name);
    context.put(entity, FAMILY, ARTICLE_NAME_QUALIFIER, name);
    context.put(entity, FAMILY, CATEGORY_QUALIFIER, category);
    context.put(entity, FAMILY, RAW_ARTICLE_QUALIFIER, value.toString());
}
From source file:com.moz.fiji.mapreduce.TestDistributedCacheJars.java
License:Apache License
/**
 * Pre: Requires mTempDir to be set and filled (only) with .jar files.
 * These don't need to actually be jars.
 *
 * Creates a new Job and checks that jars de-dupe.
 *
 * @throws IOException if configuration can not be created.
 */
@Test
public void testJarsDeDupe() throws IOException {
    final File tempDir = getLocalTempDir();

    // Jar list should de-dupe to {"myjar_a", "myjar_b", "myjar_0", "myjar_1"}.
    Set<String> dedupedJarNames = new HashSet<String>(4);
    dedupedJarNames.add("myjar_a.jar");
    dedupedJarNames.add("myjar_b.jar");
    dedupedJarNames.add("myjar_0.jar");
    dedupedJarNames.add("myjar_1.jar");

    Job job = new Job();

    List<String> someJars = new ArrayList<String>();
    // Some unique jar names.
    someJars.add("/somepath/myjar_a.jar");
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/myjar_0.jar");

    // Duplicate jars.
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/yet/another/path/myjar_b.jar");

    job.getConfiguration().set(CONF_TMPJARS, StringUtils.join(someJars, ","));

    // Now add some duplicate jars from mTempDir.
    assertEquals(0, tempDir.list().length);
    createTestJars(tempDir, "myjar_0.jar", "myjar_1.jar");
    assertEquals(2, tempDir.list().length);
    DistributedCacheJars.addJarsToDistributedCache(job, tempDir);

    // Confirm each jar appears in the de-duped list exactly once.
    String listedJars = job.getConfiguration().get(CONF_TMPJARS);
    String[] jars = listedJars.split(",");
    for (String jar : jars) {
        // Check that the path terminates in an expected jar.
        Path p = new Path(jar);
        assertTrue(dedupedJarNames.contains(p.getName()));
        dedupedJarNames.remove(p.getName());
    }
    assertEquals(0, dedupedJarNames.size());
}
From source file:com.moz.fiji.schema.mapreduce.DistributedCacheJars.java
License:Apache License
/**
 * Takes a list of paths and returns a list of paths with unique filenames.
 *
 * @param jarList A list of jars to de-dupe.
 * @return A de-duplicated list of jars.
 */
public static List<String> deDuplicateJarNames(List<String> jarList) {
    Set<String> jarNames = new HashSet<String>();
    List<String> jarPaths = new ArrayList<String>();
    for (String jar : jarList) {
        Path path = new Path(jar);
        String jarName = path.getName();
        if (!jarNames.contains(jarName)) {
            jarNames.add(jarName);
            jarPaths.add(jar);
        } else {
            LOG.warn("Skipping jar at " + jar + " because " + jarName + " already added.");
        }
    }
    return jarPaths;
}
From source file:com.moz.fiji.schema.mapreduce.TestDistributedCacheJars.java
License:Apache License
/**
 * Pre: Requires mTempDir to be set and filled (only) with .jar files.
 * These don't need to actually be jars.
 *
 * Creates a new Job and checks that jars de-dupe.
 *
 * @throws IOException if configuration can not be created.
 */
@Test
public void testJarsDeDupe() throws IOException {
    // Jar list should de-dupe to {"myjar_a", "myjar_b", "myjar_0", "myjar_1"}.
    Set<String> dedupedJarNames = new HashSet<String>(4);
    dedupedJarNames.add("myjar_a.jar");
    dedupedJarNames.add("myjar_b.jar");
    dedupedJarNames.add("myjar_0.jar");
    dedupedJarNames.add("myjar_1.jar");

    Job job = new Job();

    List<String> someJars = new ArrayList<String>();
    // Some unique jar names.
    someJars.add("/somepath/myjar_a.jar");
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/myjar_0.jar");

    // Duplicate jars.
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/yet/another/path/myjar_b.jar");

    job.getConfiguration().set(CONF_TMPJARS, StringUtils.join(someJars, ","));

    // Now add some duplicate jars from mTempDir.
    assertEquals(0, mTempDir.getRoot().list().length);
    createTestJars("myjar_0.jar", "myjar_1.jar");
    assertEquals(2, mTempDir.getRoot().list().length);
    DistributedCacheJars.addJarsToDistributedCache(job, mTempDir.getRoot());

    // Confirm each jar appears in the de-duped list exactly once.
    String listedJars = job.getConfiguration().get(CONF_TMPJARS);
    String[] jars = listedJars.split(",");
    for (String jar : jars) {
        // Check that the path terminates in an expected jar.
        Path p = new Path(jar);
        assertTrue(dedupedJarNames.contains(p.getName()));
        dedupedJarNames.remove(p.getName());
    }
    assertEquals(0, dedupedJarNames.size());
}
From source file:com.mozilla.grouperfish.transforms.coclustering.display.WriteCoClusteringOutput.java
License:Apache License
private void loadCentroids() throws IOException {
    Text k = new Text();
    Cluster v = new Cluster();
    CoCluster c;
    SequenceFile.Reader currReader = null;
    try {
        fs = FileSystem.get(clustersPath.toUri(), conf);
        for (FileStatus status : fs.listStatus(clustersPath)) {
            Path p = status.getPath();
            // Skip directories and hidden files such as _SUCCESS or _logs.
            if (!status.isDir() && !p.getName().startsWith("_")) {
                try {
                    currReader = new SequenceFile.Reader(fs, p, conf);
                    while (currReader.next(k, v)) {
                        c = new CoCluster(v.getCenter(), v.getMeasure());
                        coclusters.put(v.getId(), c);
                    }
                } finally {
                    IOUtils.closeStream(currReader);
                }
            }
        }
    } catch (IOException ie) {
        LOG.error("Error while reading clusters", ie);
    } finally {
        if (currReader != null) {
            IOUtils.closeStream(currReader);
        }
        if (fs != null) {
            fs.close();
        }
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.display.WriteCoClusteringOutput.java
License:Apache License
private void loadText(Set<String> allTopDocIDs) throws IOException {
    Map<String, String> currLine = null;
    String currID;
    String currText;
    ObjectMapper mapper = new ObjectMapper();
    BufferedReader reader = null;
    try {
        fs = FileSystem.get(docIDTextMapPath.toUri(), conf);
        for (FileStatus status : fs.listStatus(docIDTextMapPath)) {
            Path p = status.getPath();
            // Skip directories and hidden files such as _SUCCESS or _logs.
            if (!status.isDir() && !p.getName().startsWith("_")) {
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(status.getPath())));
                    String line = null;
                    currID = null;
                    while ((line = reader.readLine()) != null) {
                        // Each line is a tab-separated pair: <key>\t<JSON map of fields>.
                        String[] pair = line.split("\t", 2);
                        currLine = mapper.readValue(pair[1], new TypeReference<Map<String, String>>() {
                        });
                        if (currLine.containsKey(this.DOC_ID)) {
                            currID = currLine.get(this.DOC_ID);
                            if (allTopDocIDs.contains(currID)) {
                                currText = " ";
                                for (String s : this.DOC_TEXT) {
                                    if (currLine.containsKey(s)) {
                                        currText += currLine.get(s) + '\t';
                                    } else {
                                        LOG.error("Possibly malformed" + "line,doesn't contain" + this.DOC_ID);
                                    }
                                }
                                if (currText != " ") {
                                    currText = currText.trim();
                                    docIDTextMap.put(currID, currText);
                                }
                            }
                        } else {
                            LOG.error("Possibly malformed line," + " doesn't contain" + this.DOC_ID);
                        }
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error reading original text file", e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.error("Error closing original text file", e);
            }
        }
        if (fs != null) {
            fs.close();
        }
    }
}
From source file:com.mozilla.grouperfish.transforms.coclustering.display.WriteCoClusteringOutput.java
License:Apache License
private void loadPoints() throws IOException {
    SequenceFile.Reader currReader = null;
    IntWritable k = new IntWritable();
    CoCluster currCluster;
    int currVID;
    WeightedVectorWritable wvw = new WeightedVectorWritable();
    try {
        fs = FileSystem.get(clusteredPointsPath.toUri(), conf);
        for (FileStatus status : fs.listStatus(clusteredPointsPath)) {
            Path p = status.getPath();
            // Skip directories and hidden files such as _SUCCESS or _logs.
            if (!status.isDir() && !p.getName().startsWith("_")) {
                try {
                    currReader = new SequenceFile.Reader(fs, p, conf);
                    while (currReader.next(k, wvw)) {
                        currCluster = coclusters.get(k.get());
                        NamedVector v = (NamedVector) wvw.getVector();
                        currVID = Integer.parseInt(v.getName());
                        if (docIDMap.containsKey(currVID)) {
                            currCluster.put(v, docIDMap.get(currVID), true);
                        } else if (featureIDMap.containsKey(currVID)) {
                            currCluster.put(v, featureIDMap.get(currVID), false);
                        } else {
                            LOG.error("Key not feature or document!");
                        }
                    }
                } finally {
                    if (currReader != null) {
                        IOUtils.closeStream(currReader);
                    }
                }
            }
        }
    } catch (IOException ie) {
        LOG.info("Error while reading points", ie);
    } catch (ClassCastException ce) {
        LOG.info("NamedVectors possibly not used", ce);
    } finally {
        if (currReader != null) {
            IOUtils.closeStream(currReader);
        }
        if (fs != null) {
            fs.close();
        }
    }
}