Example usage for org.apache.hadoop.fs Path SEPARATOR

Introduction

This page collects example usages of org.apache.hadoop.fs.Path.SEPARATOR from open-source projects.

Prototype

public static final String SEPARATOR = "/";

Document

The directory separator, a slash.
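
As a quick orientation before the project examples, here is a minimal sketch of the constant in use. Path.SEPARATOR is always the forward slash, independent of the platform's java.io.File.separator; the directory and file names below are illustrative only.

import org.apache.hadoop.fs.Path;

public class PathSeparatorExample {
    public static void main(String[] args) {
        String dir = "user/hadoop/output"; // hypothetical directory
        String file = "part-00000";        // hypothetical file name
        // Join with Path.SEPARATOR ("/") rather than File.separator so the
        // resulting string is a valid Hadoop path on any platform.
        Path p = new Path(dir + Path.SEPARATOR + file);
        System.out.println(p); // prints user/hadoop/output/part-00000
    }
}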

Usage

From source file:org.apache.mahout.text.SequenceFilesFromCsvFilter.java

License:Apache License

@Override
protected void process(FileStatus fst, Path current) throws IOException {
    if (fst.isDir()) {
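        // Note: the returned FileStatus[] is discarded; in Mahout this filter's
        // accept() callback does the recursive processing as a side effect of listing.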
        fs.listStatus(fst.getPath(), new SequenceFilesFromCsvFilter(conf,
                prefix + Path.SEPARATOR + current.getName(), this.options, writer));
    } else {
        InputStream in = fs.open(fst.getPath());
        for (CharSequence aFit : new FileLineIterable(in, charset, false)) {
            String[] columns = TAB.split(aFit);
            log.info("key : {}, value : {}", columns[keyColumn], columns[valueColumn]);
            String key = columns[keyColumn];
            String value = columns[valueColumn];
            writer.write(prefix + key, value);
        }
    }
}

From source file:org.apache.mahout.text.SequenceFilesFromDirectoryMapper.java

License:Apache License

public void map(IntWritable key, BytesWritable value, Context context)
        throws IOException, InterruptedException {

    Configuration configuration = context.getConfiguration();
    Path filePath = ((CombineFileSplit) context.getInputSplit()).getPath(key.get());
    String relativeFilePath = HadoopUtil.calcRelativeFilePath(configuration, filePath);

    String filename = this.keyPrefix.length() > 0 ? this.keyPrefix + Path.SEPARATOR + relativeFilePath
            : Path.SEPARATOR + relativeFilePath;

    // use getLength(): BytesWritable.getBytes() may return a padded backing array
    fileValue.set(value.getBytes(), 0, value.getLength());
    context.write(new Text(filename), fileValue);
}

From source file:org.apache.mahout.text.SequenceFilesFromMailArchivesMapper.java

License:Apache License

protected static String generateKey(String mboxFilename, String prefix, String messageId) {
    return Joiner.on(Path.SEPARATOR).join(Lists.newArrayList(prefix, mboxFilename, messageId).iterator());
}
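
Given the join order (prefix, mbox file name, message id), a call such as generateKey("inbox.mbox", "archives", "msg-001") yields the key "archives/inbox.mbox/msg-001"; the argument values here are illustrative.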

From source file:org.apache.mahout.text.TestSequenceFilesFromDirectory.java

License:Apache License

private static void checkChunkFiles(Configuration configuration, Path outputDir, String[][] data, String prefix)
        throws IOException {
    FileSystem fs = FileSystem.get(configuration);

    // output exists?
    FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
    assertEquals(1, fileStatuses.length); // only one
    assertEquals("chunk-0", fileStatuses[0].getPath().getName());

    Map<String, String> fileToData = Maps.newHashMap();
    for (String[] aData : data) {
        fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
    }

    // read a chunk to check content
    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<Text, Text>(fileStatuses[0].getPath(),
            true, configuration);
    try {
        while (iterator.hasNext()) {
            Pair<Text, Text> record = iterator.next();
            String retrievedData = fileToData.get(record.getFirst().toString().trim());
            assertNotNull(retrievedData);
            assertEquals(retrievedData, record.getSecond().toString().trim());
        }
    } finally {
        Closeables.close(iterator, true);
    }
}

From source file:org.apache.mahout.text.TestSequenceFilesFromDirectory.java

License:Apache License

private static void checkRecursiveChunkFiles(Configuration configuration, Path outputDir, String[][] data,
        String prefix) throws IOException {
    FileSystem fs = FileSystem.get(configuration);

    System.out.println(" ----------- check_Recursive_ChunkFiles ------------");

    // output exists?
    FileStatus[] fileStatuses = fs.listStatus(outputDir, PathFilters.logsCRCFilter());
    assertEquals(1, fileStatuses.length); // only one
    assertEquals("chunk-0", fileStatuses[0].getPath().getName());

    Map<String, String> fileToData = Maps.newHashMap();
    String currentPath = prefix;
    for (String[] aData : data) {
        currentPath += Path.SEPARATOR + aData[0];
        fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
    }

    // read a chunk to check content
    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<Text, Text>(fileStatuses[0].getPath(),
            true, configuration);
    try {
        while (iterator.hasNext()) {
            Pair<Text, Text> record = iterator.next();
            String retrievedData = fileToData.get(record.getFirst().toString().trim());
            System.out.printf("%s >> %s\n", record.getFirst().toString().trim(),
                    record.getSecond().toString().trim());

            assertNotNull(retrievedData);
            assertEquals(retrievedData, record.getSecond().toString().trim());
            System.out.printf(">>> k: %s, v: %s\n", record.getFirst().toString(),
                    record.getSecond().toString());
        }
    } finally {
        Closeables.close(iterator, true);
    }
}

From source file:org.apache.mahout.text.TestSequenceFilesFromDirectory.java

License:Apache License

private static void checkMRResultFiles(Configuration conf, Path outputDir, String[][] data, String prefix)
        throws IOException {
    FileSystem fs = FileSystem.get(conf);

    // output exists?
    FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
    assertEquals(1, fileStatuses.length); // only one
    assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
    Map<String, String> fileToData = Maps.newHashMap();
    for (String[] aData : data) {
        System.out.printf("map.put: %s %s\n", prefix + Path.SEPARATOR + aData[0], aData[1]);
        fileToData.put(prefix + Path.SEPARATOR + aData[0], aData[1]);
    }

    // read a chunk to check content
    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<Text, Text>(fileStatuses[0].getPath(),
            true, conf);
    try {
        while (iterator.hasNext()) {
            Pair<Text, Text> record = iterator.next();
            String retrievedData = fileToData.get(record.getFirst().toString().trim());

            System.out.printf("MR> %s >> %s\n", record.getFirst().toString().trim(),
                    record.getSecond().toString().trim());
            assertNotNull(retrievedData);
            assertEquals(retrievedData, record.getSecond().toString().trim());
        }
    } finally {
        Closeables.close(iterator, true);
    }
}

From source file:org.apache.mahout.text.TestSequenceFilesFromDirectory.java

License:Apache License

private static void checkMRResultFilesRecursive(Configuration configuration, Path outputDir, String[][] data,
        String prefix) throws IOException {
    FileSystem fs = FileSystem.get(configuration);

    // output exists?
    FileStatus[] fileStatuses = fs.listStatus(outputDir.suffix("/part-m-00000"), PathFilters.logsCRCFilter());
    assertEquals(1, fileStatuses.length); // only one
    assertEquals("part-m-00000", fileStatuses[0].getPath().getName());
    Map<String, String> fileToData = Maps.newHashMap();
    String currentPath = prefix;

    for (String[] aData : data) {
        currentPath += Path.SEPARATOR + aData[0];
        fileToData.put(currentPath + Path.SEPARATOR + "file.txt", aData[1]);
    }

    // read a chunk to check content
    SequenceFileIterator<Text, Text> iterator = new SequenceFileIterator<Text, Text>(fileStatuses[0].getPath(),
            true, configuration);
    try {
        while (iterator.hasNext()) {
            Pair<Text, Text> record = iterator.next();
            System.out.printf("MR-Recur > Trying to check: %s\n", record.getFirst().toString().trim());
            String retrievedData = fileToData.get(record.getFirst().toString().trim());
            assertNotNull(retrievedData);
            assertEquals(retrievedData, record.getSecond().toString().trim());
        }
    } finally {
        Closeables.close(iterator, true);
    }
}

From source file:org.apache.nifi.processors.hadoop.AbstractHadoopProcessor.java

License:Apache License

/**
 * Returns the relative path of the child that does not include the filename or the root path.
 *
 * @param root
 *            the path to relativize from
 * @param child
 *            the path to relativize
 * @return the relative path
 */
public static String getPathDifference(final Path root, final Path child) {
    final int depthDiff = child.depth() - root.depth();
    if (depthDiff <= 1) {
        return "".intern();
    }
    String lastRoot = root.getName();
    Path childsParent = child.getParent();
    final StringBuilder builder = new StringBuilder();
    builder.append(childsParent.getName());
    for (int i = (depthDiff - 3); i >= 0; i--) {
        childsParent = childsParent.getParent();
        String name = childsParent.getName();
        if (name.equals(lastRoot) && childsParent.toString().endsWith(root.toString())) {
            break;
        }
        builder.insert(0, Path.SEPARATOR).insert(0, name);
    }
    return builder.toString();
}
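
As a usage sketch with illustrative paths: for root = /data and child = /data/a/b/file.txt, depthDiff is 3, the builder starts from the child's parent name ("b"), prepends "a" on the next pass, and the method returns "a/b", i.e. the path between the root and the file's parent.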

From source file:org.apache.nutch.crawl.Crawl.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.out.println("Usage: Crawl <urlDir> -solr <solrURL> [-dir d] [-threads n] [-depth i] [-topN N]");
        return -1;
    }
    Path rootUrlDir = null;
    Path dir = new Path("crawl-" + getDate());
    int threads = getConf().getInt("fetcher.threads.fetch", 10);
    int depth = 5;
    long topN = Long.MAX_VALUE;
    String solrUrl = null;

    for (int i = 0; i < args.length; i++) {
        if ("-dir".equals(args[i])) {
            dir = new Path(args[i + 1]);
            i++;
        } else if ("-threads".equals(args[i])) {
            threads = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-depth".equals(args[i])) {
            depth = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-topN".equals(args[i])) {
            topN = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-solr".equals(args[i])) {
            solrUrl = args[i + 1];
            i++;
        } else if (args[i] != null) {
            rootUrlDir = new Path(args[i]);
        }
    }

    JobConf job = new NutchJob(getConf());

    if (solrUrl == null) {
        LOG.warn("solrUrl is not set, indexing will be skipped...");
    }

    FileSystem fs = FileSystem.get(job);

    if (LOG.isInfoEnabled()) {
        LOG.info("crawl started in: " + dir);
        LOG.info("rootUrlDir = " + rootUrlDir);
        LOG.info("threads = " + threads);
        LOG.info("depth = " + depth);
        LOG.info("solrUrl=" + solrUrl);
        if (topN != Long.MAX_VALUE)
            LOG.info("topN = " + topN);
    }

    Path crawlDb = new Path(dir + "/crawldb");
    Path linkDb = new Path(dir + "/linkdb");
    Path segments = new Path(dir + "/segments");
    Path indexes = new Path(dir + "/indexes");
    Path index = new Path(dir + "/index");

    Path tmpDir = job.getLocalPath("crawl" + Path.SEPARATOR + getDate());
    Injector injector = new Injector(getConf());
    Generator generator = new Generator(getConf());
    Fetcher fetcher = new Fetcher(getConf());
    ParseSegment parseSegment = new ParseSegment(getConf());
    CrawlDb crawlDbTool = new CrawlDb(getConf());
    LinkDb linkDbTool = new LinkDb(getConf());

    // initialize crawlDb
    injector.inject(crawlDb, rootUrlDir);
    int i;
    for (i = 0; i < depth; i++) { // generate new segment
        Path[] segs = generator.generate(crawlDb, segments, -1, topN, System.currentTimeMillis());
        if (segs == null) {
            LOG.info("Stopping at depth=" + i + " - no more URLs to fetch.");
            break;
        }
        fetcher.fetch(segs[0], threads); // fetch it
        if (!Fetcher.isParsing(job)) {
            parseSegment.parse(segs[0]); // parse it, if needed
        }
        crawlDbTool.update(crawlDb, segs, true, true); // update crawldb
    }
    if (i > 0) {
        linkDbTool.invert(linkDb, segments, true, true, false); // invert links

        if (solrUrl != null) {
            // index, dedup & merge
            FileStatus[] fstats = fs.listStatus(segments, HadoopFSUtil.getPassDirectoriesFilter(fs));
            SolrIndexer indexer = new SolrIndexer(getConf());
            indexer.indexSolr(solrUrl, crawlDb, linkDb, Arrays.asList(HadoopFSUtil.getPaths(fstats)));
            SolrDeleteDuplicates dedup = new SolrDeleteDuplicates();
            dedup.setConf(getConf());
            dedup.dedup(solrUrl);
        }

    } else {
        LOG.warn("No URLs to fetch - check your seed list and URL filters.");
    }
    if (LOG.isInfoEnabled()) {
        LOG.info("crawl finished: " + dir);
    }
    return 0;
}

From source file:org.apache.oozie.service.TestHAShareLibService.java

License:Apache License

protected void setUp() throws Exception {
    super.setUp();
    container = new EmbeddedServletContainer("oozie");
    container.addServletEndpoint("/v2/admin/*", V2AdminServlet.class);
    container.addServletEndpoint("/other-oozie-server/*", DummyV2AdminServlet.class);
    container.addFilter("*", HostnameFilter.class);
    container.start();
    Services.get().setService(ShareLibService.class);
    Services.get().getConf().setBoolean(AuthorizationService.CONF_SECURITY_ENABLED, false);

    Services.get().setService(ZKJobsConcurrencyService.class);

    Path launcherlibPath = Services.get().get(WorkflowAppService.class).getSystemLibPath();
    HadoopAccessorService has = Services.get().get(HadoopAccessorService.class);
    URI uri = launcherlibPath.toUri();
    fs = FileSystem.get(has.createJobConf(uri.getAuthority()));
    Date time = new Date(System.currentTimeMillis());

    Path basePath = new Path(Services.get().getConf().get(WorkflowAppService.SYSTEM_LIB_PATH));
    Path libpath = new Path(basePath,
            ShareLibService.SHARE_LIB_PREFIX + ShareLibService.dateFormat.format(time));
    fs.mkdirs(libpath);

    Path pigPath = new Path(libpath.toString() + Path.SEPARATOR + "pig");
    Path pigPath1 = new Path(libpath.toString() + Path.SEPARATOR + "pig_9");
    Path pigPath2 = new Path(libpath.toString() + Path.SEPARATOR + "pig_10");
    fs.mkdirs(pigPath);
    fs.mkdirs(pigPath1);
    fs.mkdirs(pigPath2);
    fs.create(new Path(libpath.toString() + Path.SEPARATOR + "pig_10" + Path.SEPARATOR + "pig-10.jar")).close();

}