Example usage for org.apache.hadoop.fs Path getName

Introduction

This page lists example usages of org.apache.hadoop.fs.Path#getName, drawn from the source files below.

Prototype

public String getName() 

Document

Returns the final component of this path.
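
A minimal standalone sketch of this behavior (the path strings are invented for illustration):

import org.apache.hadoop.fs.Path;

public class PathGetNameDemo {
    public static void main(String[] args) {
        // getName() returns only the final component, with no parent directories.
        Path file = new Path("hdfs://namenode:8020/user/hive/warehouse/part-00000.bson");
        System.out.println(file.getName()); // part-00000.bson

        // Scheme and authority are ignored; a directory path behaves the same way.
        Path dir = new Path("/user/hive/warehouse");
        System.out.println(dir.getName()); // warehouse
    }
}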

Usage

From source file: com.mongodb.hadoop.hive.output.HiveBSONFileOutputFormat.java

License: Apache License

/**
 * Creates the final output file.
 *
 * @param jc              the job configuration
 * @param fileOutputPath  the file that the output should be directed at
 * @param valueClass      the value class used to create
 * @param tableProperties the tableInfo for this file's corresponding table
 * @return RecordWriter for the output file
 */
@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path fileOutputPath,
        final Class<? extends Writable> valueClass, final boolean isCompressed,
        final Properties tableProperties, final Progressable progress) throws IOException {

    LOG.info("Output going into " + fileOutputPath);

    FileSystem fs = fileOutputPath.getFileSystem(jc);
    FSDataOutputStream outFile = fs.create(fileOutputPath);

    FSDataOutputStream splitFile = null;
    if (MongoConfigUtil.getBSONOutputBuildSplits(jc)) {
        Path splitPath = new Path(fileOutputPath.getParent(), "." + fileOutputPath.getName() + ".splits");
        splitFile = fs.create(splitPath);
    }

    long splitSize = BSONSplitter.getSplitSize(jc, null);

    return new HiveBSONFileRecordWriter(outFile, splitFile, splitSize);
}

From source file: com.mongodb.hadoop.mapred.BSONFileOutputFormat.java

License: Apache License

public RecordWriter<K, V> getRecordWriter(final FileSystem ignored, final JobConf job, final String name,
        final Progressable progress) throws IOException {
    Path outPath = getDefaultWorkFile(job, name, ".bson");
    LOG.info("output going into " + outPath);

    FileSystem fs = outPath.getFileSystem(job);
    FSDataOutputStream outFile = fs.create(outPath);

    FSDataOutputStream splitFile = null;
    if (MongoConfigUtil.getBSONOutputBuildSplits(job)) {
        Path splitPath = new Path(outPath.getParent(), "." + outPath.getName() + ".splits");
        splitFile = fs.create(splitPath);
    }

    long splitSize = BSONSplitter.getSplitSize(job, null);

    return new BSONFileRecordWriter<K, V>(outFile, splitFile, splitSize);
}
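
Both record writers above derive the splits file as a hidden sibling of the output file. A standalone sketch of that naming, using a hypothetical output path:

import org.apache.hadoop.fs.Path;

public class SplitPathDemo {
    public static void main(String[] args) {
        Path out = new Path("/user/hadoop/out/part-00000.bson"); // hypothetical output file
        // Same parent directory, with a "." prefix and ".splits" suffix around the file name.
        Path splits = new Path(out.getParent(), "." + out.getName() + ".splits");
        System.out.println(splits); // /user/hadoop/out/.part-00000.bson.splits
    }
}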

From source file: com.moz.fiji.mapreduce.DistributedCacheJars.java

License: Apache License

/**
 * Removes files whose name are duplicated in a given collection.
 *
 * @param jarFiles Collection of .jar files to de-duplicate.
 * @return De-duplicated collection of .jar files.
 */
public static List<Path> deDuplicateFilenames(Iterable<Path> jarFiles) {
    final Set<String> jarFileNames = Sets.newHashSet();
    final List<Path> uniqueFiles = Lists.newArrayList();
    for (Path jarFile : jarFiles) {
        if (jarFileNames.add(jarFile.getName())) {
            uniqueFiles.add(jarFile);
        }
    }
    return uniqueFiles;
}
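
A hypothetical caller for this helper (the jar paths are invented) showing that de-duplication keys on getName() and ignores the parent directories:

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.fs.Path;
import com.moz.fiji.mapreduce.DistributedCacheJars;

public class DeDupeFilenamesDemo {
    public static void main(String[] args) {
        List<Path> jars = Arrays.asList(
                new Path("/lib/a/common.jar"),
                new Path("/lib/b/common.jar"), // same file name, different directory
                new Path("/lib/b/extra.jar"));
        // Only the first "common.jar" survives; the second is dropped because
        // its final path component has already been seen.
        List<Path> unique = DistributedCacheJars.deDuplicateFilenames(jars);
        System.out.println(unique); // [/lib/a/common.jar, /lib/b/extra.jar]
    }
}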

From source file: com.moz.fiji.mapreduce.lib.examples.News20BulkImporter.java

License: Apache License

/**
 * Reads a single news article and writes its contents to a new fiji row,
 * indexed by the article's name (a string consisting of the parent folder name
 * and this article's hash) and by the a priori category of the article.
 *
 * @param key The fully qualified path to the current file we're reading.
 * @param value The raw data to insert into this column.
 * @param context The context to write to.
 * @throws IOException if there is an error.
 */
@Override
public void produce(Text key, Text value, FijiTableContext context) throws IOException {
    Path qualifiedPath = new Path(key.toString());

    // Category is specified on the containing folder.
    String category = qualifiedPath.getParent().getName();
    // Name is the concatenation of category and file name.
    String name = category + "." + qualifiedPath.getName();

    // write name, category, and raw article.
    EntityId entity = context.getEntityId(name);
    context.put(entity, FAMILY, ARTICLE_NAME_QUALIFIER, name);
    context.put(entity, FAMILY, CATEGORY_QUALIFIER, category);
    context.put(entity, FAMILY, RAW_ARTICLE_QUALIFIER, value.toString());
}
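
The row key derivation above uses only Path methods; a standalone sketch with an invented 20 Newsgroups style layout:

import org.apache.hadoop.fs.Path;

public class ArticleNameDemo {
    public static void main(String[] args) {
        // Layout: .../<category folder>/<article hash>
        Path article = new Path("/data/news20/rec.sport.hockey/53731");
        String category = article.getParent().getName(); // rec.sport.hockey
        String name = category + "." + article.getName(); // rec.sport.hockey.53731
        System.out.println(name);
    }
}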

From source file: com.moz.fiji.mapreduce.TestDistributedCacheJars.java

License: Apache License

/**
 * Pre: Requires mTempDir to be set and filled (only) with .jar files.
 * These don't need to actually be jars.
 *
 * Creates a new Job and checks that jars de-dupe.
 *
 * @throws IOException if configuration can not be created.
 */
@Test
public void testJarsDeDupe() throws IOException {
    final File tempDir = getLocalTempDir();

    // Jar list should de-dupe to {"myjar_a.jar", "myjar_b.jar", "myjar_0.jar", "myjar_1.jar"}.
    Set<String> dedupedJarNames = new HashSet<String>(4);
    dedupedJarNames.add("myjar_a.jar");
    dedupedJarNames.add("myjar_b.jar");
    dedupedJarNames.add("myjar_0.jar");
    dedupedJarNames.add("myjar_1.jar");

    Job job = new Job();

    List<String> someJars = new ArrayList<String>();
    // Some unique jar names.
    someJars.add("/somepath/myjar_a.jar");
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/myjar_0.jar");

    // Duplicate jars.
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/yet/another/path/myjar_b.jar");

    job.getConfiguration().set(CONF_TMPJARS, StringUtils.join(someJars, ","));

    // Now add some duplicate jars from mTempDir.
    assertEquals(0, tempDir.list().length);
    createTestJars(tempDir, "myjar_0.jar", "myjar_1.jar");
    assertEquals(2, tempDir.list().length);
    DistributedCacheJars.addJarsToDistributedCache(job, tempDir);

    // Confirm each jar appears in de-dupe list exactly once.
    String listedJars = job.getConfiguration().get(CONF_TMPJARS);
    String[] jars = listedJars.split(",");
    for (String jar : jars) {
        // Check that path terminates in an expected jar.
        Path p = new Path(jar);
        assertTrue(dedupedJarNames.contains(p.getName()));
        dedupedJarNames.remove(p.getName());
    }
    assertEquals(0, dedupedJarNames.size());
}

From source file: com.moz.fiji.schema.mapreduce.DistributedCacheJars.java

License: Apache License

/**
 * Takes a list of paths and returns a list of paths with unique filenames.
 *
 * @param jarList A list of jars to de-dupe.
 * @return A de-duplicated list of jars.
 */
public static List<String> deDuplicateJarNames(List<String> jarList) {
    Set<String> jarNames = new HashSet<String>();
    List<String> jarPaths = new ArrayList<String>();
    for (String jar : jarList) {
        Path path = new Path(jar);
        String jarName = path.getName();
        if (!jarNames.contains(jarName)) {
            jarNames.add(jarName);
            jarPaths.add(jar);
        } else {
            LOG.warn("Skipping jar at " + jar + " because " + jarName + " already added.");
        }
    }
    return jarPaths;
}
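
The String-based variant behaves the same way; a hypothetical call (paths invented):

import java.util.Arrays;
import java.util.List;
import com.moz.fiji.schema.mapreduce.DistributedCacheJars;

public class DeDupeJarNamesDemo {
    public static void main(String[] args) {
        List<String> jars = Arrays.asList("/lib/a/common.jar", "/lib/b/common.jar");
        // The second entry is skipped (and a warning logged) because "common.jar"
        // has already been added under a different parent directory.
        List<String> unique = DistributedCacheJars.deDuplicateJarNames(jars);
        System.out.println(unique); // [/lib/a/common.jar]
    }
}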

From source file: com.moz.fiji.schema.mapreduce.TestDistributedCacheJars.java

License: Apache License

/**
 * Pre: Requires mTempDir to be set and filled (only) with .jar files.
 * These don't need to actually be jars.
 *
 * Creates a new Job and checks that jars de-dupe.
 *
 * @throws IOException if configuration can not be created.
 */
@Test
public void testJarsDeDupe() throws IOException {
    // Jar list should de-dupe to {"myjar_a.jar", "myjar_b.jar", "myjar_0.jar", "myjar_1.jar"}.
    Set<String> dedupedJarNames = new HashSet<String>(4);
    dedupedJarNames.add("myjar_a.jar");
    dedupedJarNames.add("myjar_b.jar");
    dedupedJarNames.add("myjar_0.jar");
    dedupedJarNames.add("myjar_1.jar");

    Job job = new Job();

    List<String> someJars = new ArrayList<String>();
    // Some unique jar names.
    someJars.add("/somepath/myjar_a.jar");
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/myjar_0.jar");

    // Duplicate jars.
    someJars.add("/another/path/myjar_b.jar");
    someJars.add("/yet/another/path/myjar_b.jar");

    job.getConfiguration().set(CONF_TMPJARS, StringUtils.join(someJars, ","));

    // Now add some duplicate jars from mTempDir.
    assertEquals(0, mTempDir.getRoot().list().length);
    createTestJars("myjar_0.jar", "myjar_1.jar");
    assertEquals(2, mTempDir.getRoot().list().length);
    DistributedCacheJars.addJarsToDistributedCache(job, mTempDir.getRoot());

    // Confirm each jar appears in de-dupe list exactly once.
    String listedJars = job.getConfiguration().get(CONF_TMPJARS);
    String[] jars = listedJars.split(",");
    for (String jar : jars) {
        // Check that path terminates in an expected jar.
        Path p = new Path(jar);
        assertTrue(dedupedJarNames.contains(p.getName()));
        dedupedJarNames.remove(p.getName());
    }
    assertEquals(0, dedupedJarNames.size());
}

From source file: com.mozilla.grouperfish.transforms.coclustering.display.WriteCoClusteringOutput.java

License: Apache License

private void loadCentroids() throws IOException {
    Text k = new Text();
    Cluster v = new Cluster();
    CoCluster c;
    SequenceFile.Reader currReader = null;
    try {
        fs = FileSystem.get(clustersPath.toUri(), conf);
        for (FileStatus status : fs.listStatus(clustersPath)) {
            Path p = status.getPath();
            if (!status.isDir() && !p.getName().startsWith("_")) {
                try {
                    currReader = new SequenceFile.Reader(fs, p, conf);
                    while (currReader.next(k, v)) {
                        c = new CoCluster(v.getCenter(), v.getMeasure());
                        coclusters.put(v.getId(), c);
                    }
                } finally {
                    IOUtils.closeStream(currReader);
                }
            }
        }
    } catch (IOException ie) {
        LOG.error("Error while reading clusters", ie);
    } finally {
        if (currReader != null) {
            IOUtils.closeStream(currReader);
        }
        if (fs != null) {
            fs.close();
        }
    }
}
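
This loader, and the two below, skip any path whose final component starts with "_", Hadoop's convention for bookkeeping files such as _SUCCESS and _logs. A minimal sketch of the same check as a reusable PathFilter, assuming it is passed to FileSystem#listStatus:

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;

/** Accepts only paths whose final component is not a Hadoop bookkeeping file. */
public class VisibleFilesFilter implements PathFilter {
    @Override
    public boolean accept(Path path) {
        // getName() inspects only the last path component, so "_SUCCESS" and
        // "_logs" are rejected regardless of their parent directory.
        return !path.getName().startsWith("_");
    }
}

With this filter, fs.listStatus(clustersPath, new VisibleFilesFilter()) returns only the visible entries, and the loop body keeps just the isDir() test to skip subdirectories.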

From source file: com.mozilla.grouperfish.transforms.coclustering.display.WriteCoClusteringOutput.java

License: Apache License

private void loadText(Set<String> allTopDocIDs) throws IOException {
    Map<String, String> currLine = null;
    String currID;
    String currText;
    ObjectMapper mapper = new ObjectMapper();
    BufferedReader reader = null;
    try {
        fs = FileSystem.get(docIDTextMapPath.toUri(), conf);
        for (FileStatus status : fs.listStatus(docIDTextMapPath)) {
            Path p = status.getPath();
            if (!status.isDir() && !p.getName().startsWith("_")) {
                try {
                    reader = new BufferedReader(new InputStreamReader(fs.open(status.getPath())));
                    String line = null;
                    currID = null;
                    while ((line = reader.readLine()) != null) {
                        String[] pair = line.split("\t", 2);
                        currLine = mapper.readValue(pair[1], new TypeReference<Map<String, String>>() {});
                        if (currLine.containsKey(this.DOC_ID)) {
                            currID = currLine.get(this.DOC_ID);
                            if (allTopDocIDs.contains(currID)) {
                                currText = " ";
                                for (String s : this.DOC_TEXT) {
                                    if (currLine.containsKey(s)) {
                                        currText += currLine.get(s) + '\t';
                                    } else {
                                        LOG.error("Possibly malformed" + "line,doesn't contain" + this.DOC_ID);
                                    }
                                }
                                if (currText != " ") {
                                    currText = currText.trim();
                                    docIDTextMap.put(currID, currText);
                                }
                            }

                        } else {
                            LOG.error("Possibly malformed line," + " doesn't contain" + this.DOC_ID);
                        }
                    }
                } finally {
                    if (reader != null) {
                        reader.close();
                    }
                }
            }
        }
    } catch (IOException e) {
        LOG.error("Error reading original text file", e);
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.error("Error closing original text file", e);
            }
        }
        if (fs != null) {
            fs.close();
        }
    }
}

From source file: com.mozilla.grouperfish.transforms.coclustering.display.WriteCoClusteringOutput.java

License: Apache License

private void loadPoints() throws IOException {
    SequenceFile.Reader currReader = null;
    IntWritable k = new IntWritable();
    CoCluster currCluster;
    int currVID;
    WeightedVectorWritable wvw = new WeightedVectorWritable();
    try {
        fs = FileSystem.get(clusteredPointsPath.toUri(), conf);
        for (FileStatus status : fs.listStatus(clusteredPointsPath)) {
            Path p = status.getPath();
            if (!status.isDir() && !p.getName().startsWith("_")) {
                try {
                    currReader = new SequenceFile.Reader(fs, p, conf);
                    while (currReader.next(k, wvw)) {
                        currCluster = coclusters.get(k.get());
                        NamedVector v = (NamedVector) wvw.getVector();
                        currVID = Integer.parseInt(v.getName());
                        if (docIDMap.containsKey(currVID)) {
                            currCluster.put(v, docIDMap.get(currVID), true);
                        } else if (featureIDMap.containsKey(currVID)) {
                            currCluster.put(v, featureIDMap.get(currVID), false);
                        } else {
                            LOG.error("Key not feature or document!");
                        }
                    }
                } finally {
                    if (currReader != null) {
                        IOUtils.closeStream(currReader);
                    }
                }
            }
        }
    } catch (IOException ie) {
        LOG.error("Error while reading points", ie);
    } catch (ClassCastException ce) {
        LOG.warn("NamedVectors possibly not used", ce);
    } finally {
        if (currReader != null) {
            IOUtils.closeStream(currReader);
        }
        if (fs != null) {
            fs.close();
        }
    }
}