Example usage for org.apache.hadoop.fs Path toString

Introduction

On this page you can find example usages of org.apache.hadoop.fs.Path.toString().

Prototype

@Override
    public String toString()

Source Link

Usage

From source file:bixo.examples.SimpleCrawlWorkflow.java

License:Open Source License

/**
 * Assembles the Cascading flow for one loop of the simple crawl.
 *
 * <p>The flow reads the crawl DB at {@code crawlDbPath} (a sequence file of
 * {@code CrawlDbDatum.FIELDS}), splits it into already-fetched and to-be-fetched
 * entries, fetches and parses the latter, and writes four outputs below
 * {@code curWorkingDirPath}: the updated crawl DB, the fetched content, the parse
 * output and the fetch status.
 *
 * <p>Side effect: {@code fetcherPolicy} is modified in place; its valid MIME types are
 * set to {@code text/plain} and {@code text/html}.
 *
 * @param curWorkingDirPath directory for this loop; all sinks are created under it, and
 *                          its file system is used for the existence check
 * @param crawlDbPath       location of the input crawl DB; must already exist
 * @param fetcherPolicy     policy handed to the fetcher (mutated, see above)
 * @param userAgent         user agent handed to the fetcher
 * @param urlFilter         filter applied to URLs extracted from outlinks
 * @param options           supplies the debug-logging flag and the fetcher thread count
 * @return the connected, not yet started, flow
 * @throws RuntimeException if {@code crawlDbPath} does not exist
 * @throws Throwable        declared broadly by the original signature
 */
public static Flow createFlow(Path curWorkingDirPath, Path crawlDbPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, BaseUrlFilter urlFilter, SimpleCrawlToolOptions options) throws Throwable {
    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    // Reducer count handed to the FetchPipe: configured reduce tasks times the task tracker count.
    int numReducers = conf.getNumReduceTasks() * HadoopUtils.getTaskTrackers(conf);
    Properties props = HadoopUtils.getDefaultProperties(SimpleCrawlWorkflow.class, options.isDebugLogging(),
            conf);
    FileSystem fs = curWorkingDirPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    // Our crawl db is defined by the CrawlDbDatum
    Tap inputSource = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedCrawlDatums());

    Pipe finishedDatumsFromDb = splitter.getRHSPipe();
    Pipe urlsToFetchPipe = new Pipe("urls to Fetch", splitter.getLHSPipe());

    // Convert the urlsToFetchPipe so that we now deal with UrlDatums.
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbFunction());
    // A TupleLogger is a good way to follow the tuples around in a flow. You can enable the output
    // of tuples by setting options.setDebugLogging() to true.
    // NOTE(review): the second argument is hard-coded to true here; presumably the actual output is
    // still gated by the debug-logging setting passed into props above - confirm against TupleLogger.
    urlsToFetchPipe = TupleLogger.makePipe(urlsToFetchPipe, true);

    // Create the output sinks :
    //      crawldb
    //      content
    //      parse
    //      status
    Path outCrawlDbPath = new Path(curWorkingDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap loopCrawldbSink = new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), outCrawlDbPath.toString());

    Path contentDirPath = new Path(curWorkingDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    Path parseDirPath = new Path(curWorkingDirPath, CrawlConfig.PARSE_SUBDIR_NAME);
    Tap parseSink = new Hfs(new SequenceFile(ParsedDatum.FIELDS), parseDirPath.toString());

    Path statusDirPath = new Path(curWorkingDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());

    // Create the sub-assembly that runs the fetch job
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(options.getMaxThreads(), fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    // You can also provide a set of mime types to restrict which content types you
    // want to deal with - for now keep it simple.
    // Note that this changes the caller's fetcherPolicy object.
    Set<String> validMimeTypes = new HashSet<String>();
    validMimeTypes.add("text/plain");
    validMimeTypes.add("text/html");
    fetcherPolicy.setValidMimeTypes(validMimeTypes);

    // The scorer is used by the FetchPipe to assign a score to every URL that passes the
    // robots.txt processing. The score is used to sort URLs such that higher scoring URLs
    // are fetched first. If URLs are skipped for any reason(s) lower scoring URLs are skipped.
    BaseScoreGenerator scorer = new FixedScoreGenerator();

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Take content and split it into content output plus parse to extract URLs.
    SimpleParser parser = new SimpleParser();
    parser.setExtractLanguage(false);
    ParsePipe parsePipe = new ParsePipe(contentPipe, parser);

    // Outlinks found by the parser: convert to UrlDatums, filter, then normalize.
    Pipe urlFromOutlinksPipe = new Pipe("url from outlinks", parsePipe.getTailPipe());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new CreateUrlDatumFromOutlinksFunction());
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new UrlFilter(urlFilter));
    urlFromOutlinksPipe = new Each(urlFromOutlinksPipe, new NormalizeUrlFunction(new SimpleUrlNormalizer()));
    urlFromOutlinksPipe = TupleLogger.makePipe(urlFromOutlinksPipe, true);

    // Take status and output urls from it
    // NOTE(review): the Pipe named "url from fetch" is replaced by the very next statement and is
    // never attached to statusPipe, so that name is not part of the assembly.
    Pipe urlFromFetchPipe = new Pipe("url from fetch");
    urlFromFetchPipe = new Each(statusPipe, new CreateUrlDatumFromStatusFunction());
    urlFromFetchPipe = TupleLogger.makePipe(urlFromFetchPipe, true);

    // Finally join the URLs we get from parsing content with the URLs we got
    // from the status output, and the urls we didn't process from the db so that
    // we have a unified stream of all known URLs for the crawldb.
    Pipe finishedUrlsFromDbPipe = new Each(finishedDatumsFromDb, new CreateUrlDatumFromCrawlDbFunction());
    finishedUrlsFromDbPipe = TupleLogger.makePipe(finishedUrlsFromDbPipe, true);

    // NOTE : Ideally you would just do a CoGroup instead of converting all the pipes to emit UrlDatums
    // and then doing the extra step of converting from UrlDatum to CrawlDbDatum.
    // The reason this isn't being done here is because we are sharing LatestUrlDatumBuffer() with JDBCCrawlTool
    Pipe crawlDbPipe = new GroupBy("crawldb pipe",
            Pipe.pipes(urlFromFetchPipe, urlFromOutlinksPipe, finishedUrlsFromDbPipe),
            new Fields(UrlDatum.URL_FN));
    crawlDbPipe = new Every(crawlDbPipe, new LatestUrlDatumBuffer(), Fields.RESULTS);

    // NOTE(review): as above, the Pipe named "output pipe" is discarded by the reassignment below.
    // The crawldb sink is registered under crawlDbPipe.getName(), so the tail passed to connect()
    // presumably still carries the "crawldb pipe" name - confirm before renaming anything here.
    Pipe outputPipe = new Pipe("output pipe");
    outputPipe = new Each(crawlDbPipe, new CreateCrawlDbDatumFromUrlFunction());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(ParsePipe.PARSE_PIPE_NAME, parseSink);
    sinkMap.put(crawlDbPipe.getName(), loopCrawldbSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, statusPipe, contentPipe, parsePipe.getTailPipe(),
            outputPipe);

    return flow;
}

From source file:bixo.examples.webmining.DemoWebMiningWorkflow.java

License:Apache License

/**
 * Creates a crawl DB at {@code crawlDbPath} from a classpath resource listing seed URLs.
 *
 * <p>Each line of the resource is trimmed; blank lines and lines starting with {@code #}
 * are skipped. Every remaining line is normalized and written as an unfetched
 * {@code CrawlDbDatum} to a text-line sink at {@code crawlDbPath}.
 *
 * <p>If an {@link IOException} occurs, the writer is closed first and the (possibly
 * partial) output at {@code crawlDbPath} is then removed before the exception is rethrown.
 *
 * @param crawlDbPath destination of the crawl DB
 * @param fileName    name of the seed file, resolved as a resource relative to
 *                    {@code DemoWebMiningWorkflow}
 * @throws IOException          if the resource is missing (as {@link FileNotFoundException})
 *                              or reading/writing fails
 * @throws InterruptedException declared by the original signature
 */
public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {

    SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
    JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();

    InputStream is = null;
    TupleEntryCollector writer = null;
    try {
        Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
        writer = urlSink.openForWrite(defaultJobConf);

        is = DemoWebMiningWorkflow.class.getResourceAsStream(fileName);
        if (is == null) {
            throw new FileNotFoundException("The seed urls file doesn't exist");
        }

        // Decode explicitly as UTF-8 so the result does not depend on the platform default charset.
        List<String> lines = IOUtils.readLines(is, "UTF-8");
        for (String line : lines) {
            line = line.trim();
            // A blank line would otherwise be normalized and stored as an empty-URL entry.
            if (line.length() == 0 || line.startsWith("#")) {
                continue;
            }

            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f,
                    0.0f);
            writer.add(datum.getTuple());
        }

        writer.close();
        // Mark as closed so the finally block does not close the collector a second time.
        writer = null;
    } catch (IOException e) {
        try {
            // Close before removing, so a late flush cannot re-create the path being deleted.
            if (writer != null) {
                writer.close();
            }
        } finally {
            writer = null;
            HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
        }
        throw e;
    } finally {
        IoUtils.safeClose(is);
        // Only reached with a live writer when a non-IOException escaped the try block.
        if (writer != null) {
            writer.close();
        }
    }

}

From source file:bixo.examples.webmining.DemoWebMiningWorkflow.java

License:Apache License

/**
 * Assembles the Cascading flow for one loop of the demo web-mining crawl.
 *
 * <p>The flow reads the tab-delimited crawl DB at {@code crawlDbPath}, splits it into
 * finished and to-be-fetched entries, fetches, parses and analyzes the latter, and writes
 * four outputs below {@code curLoopDirPath}: the updated crawl DB, the fetch status, the
 * fetched content and the page results.
 *
 * @param crawlDbPath    location of the input crawl DB; must already exist
 * @param curLoopDirPath directory for this loop; all sinks are created under it
 * @param fetcherPolicy  policy handed to the fetcher
 * @param userAgent      user agent handed to the fetcher
 * @param options        not referenced in this method body
 * @return the connected, not yet started, flow
 * @throws RuntimeException     if {@code crawlDbPath} does not exist
 * @throws IOException          if the file system lookup or existence check fails
 * @throws InterruptedException declared by the original signature
 */
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, DemoWebMiningOptions options) throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.
    // NOTE(review): none of these limits are set in this method; the fetch cap used below comes
    // from MAX_LOCAL_FETCH / MAX_DISTRIBUTED_FETCH, and any size or MIME limits would have to
    // come from the fetcherPolicy passed in - confirm the numbers above are still accurate.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = HadoopUtils.getNumReducers(conf);
    conf.setNumReduceTasks(numReducers);
    // Minimum split size of 64 MB.
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(DemoWebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    // Re-queries the local-mode flag instead of reusing isLocal computed above.
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    // The parse pipe branches off the fetch pipe's content tail directly, not off contentPipe.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    // NOTE(review): the Pipe named "analyzer pipe" is replaced by the very next statement and is
    // never attached to anything, so that name is not part of the assembly.
    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    // Two branches off the analyzer output: outlinks (fed back into the crawl DB) and results.
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Outer-join the finished datums, fetch status, analyzed pages and outlinks on their URL fields
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());
    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}

From source file:bixo.examples.webmining.WebMiningWorkflow.java

License:Apache License

/**
 * Creates a crawl DB at {@code crawlDbPath} from a classpath resource listing seed URLs.
 *
 * <p>Each line of the resource is trimmed; blank lines and lines starting with {@code #}
 * are skipped. Every remaining line is normalized and written as an unfetched
 * {@code CrawlDbDatum} to a text-line sink at {@code crawlDbPath}.
 *
 * <p>If an {@link IOException} occurs, the writer is closed first and the (possibly
 * partial) output at {@code crawlDbPath} is then removed before the exception is rethrown.
 *
 * @param crawlDbPath destination of the crawl DB
 * @param fileName    name of the seed file, resolved as a resource relative to
 *                    {@code WebMiningWorkflow}
 * @throws IOException          if the resource is missing (as {@link FileNotFoundException})
 *                              or reading/writing fails
 * @throws InterruptedException declared by the original signature
 */
public static void importSeedUrls(Path crawlDbPath, String fileName) throws IOException, InterruptedException {

    SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
    JobConf defaultJobConf = HadoopUtils.getDefaultJobConf();

    InputStream is = null;
    TupleEntryCollector writer = null;
    try {
        Tap urlSink = new Hfs(new TextLine(), crawlDbPath.toString(), true);
        writer = urlSink.openForWrite(defaultJobConf);

        is = WebMiningWorkflow.class.getResourceAsStream(fileName);
        if (is == null) {
            throw new FileNotFoundException("The seed urls file doesn't exist");
        }

        // Decode explicitly as UTF-8 so the result does not depend on the platform default charset.
        List<String> lines = IOUtils.readLines(is, "UTF-8");
        for (String line : lines) {
            line = line.trim();
            // A blank line would otherwise be normalized and stored as an empty-URL entry.
            if (line.length() == 0 || line.startsWith("#")) {
                continue;
            }

            CrawlDbDatum datum = new CrawlDbDatum(normalizer.normalize(line), 0, UrlStatus.UNFETCHED, 0.0f,
                    0.0f);
            writer.add(datum.getTuple());
        }

        writer.close();
        // Mark as closed so the finally block does not close the collector a second time.
        writer = null;
    } catch (IOException e) {
        try {
            // Close before removing, so a late flush cannot re-create the path being deleted.
            if (writer != null) {
                writer.close();
            }
        } finally {
            writer = null;
            HadoopUtils.safeRemove(crawlDbPath.getFileSystem(defaultJobConf), crawlDbPath);
        }
        throw e;
    } finally {
        IoUtils.safeClose(is);
        // Only reached with a live writer when a non-IOException escaped the try block.
        if (writer != null) {
            writer.close();
        }
    }

}

From source file:bixo.examples.webmining.WebMiningWorkflow.java

License:Apache License

/**
 * Assembles the Cascading flow for one loop of the web-mining crawl.
 *
 * <p>The flow reads the tab-delimited crawl DB at {@code crawlDbPath}, splits it into
 * finished and to-be-fetched entries, fetches, parses and analyzes the latter, and writes
 * four outputs below {@code curLoopDirPath}: the updated crawl DB, the fetch status, the
 * fetched content and the page results. A single reducer is always used.
 *
 * @param crawlDbPath    location of the input crawl DB; must already exist
 * @param curLoopDirPath directory for this loop; all sinks are created under it
 * @param fetcherPolicy  policy handed to the fetcher
 * @param userAgent      user agent handed to the fetcher
 * @param options        not referenced in this method body
 * @param resetSolr      not referenced in this method body
 * @return the connected, not yet started, flow
 * @throws RuntimeException     if {@code crawlDbPath} does not exist
 * @throws IOException          if the file system lookup or existence check fails
 * @throws InterruptedException declared by the original signature
 */
public static Flow createWebMiningWorkflow(Path crawlDbPath, Path curLoopDirPath, FetcherPolicy fetcherPolicy,
        UserAgent userAgent, WebMiningOptions options, boolean resetSolr)
        throws IOException, InterruptedException {

    // Fetch at most 200 pages, max size of 128K, complete mode, from the current dir.
    // HTML only.
    // NOTE(review): none of these limits are set in this method; the fetch cap used below comes
    // from MAX_LOCAL_FETCH / MAX_DISTRIBUTED_FETCH, and any size or MIME limits would have to
    // come from the fetcherPolicy passed in - confirm the numbers above are still accurate.

    // We want to extract the cleaned up HTML, and pass that to the parser, which will
    // be specified via options.getAnalyzer. From this we'll get outlinks, page score, and
    // any results.

    JobConf conf = HadoopUtils.getDefaultJobConf(CrawlConfig.CRAWL_STACKSIZE_KB);
    boolean isLocal = HadoopUtils.isJobLocal(conf);
    int numReducers = 1; // we always want to use a single reducer, to avoid contention
    conf.setNumReduceTasks(numReducers);
    // Minimum split size of 64 MB.
    conf.setInt("mapred.min.split.size", 64 * 1024 * 1024);
    Properties props = HadoopUtils.getDefaultProperties(WebMiningWorkflow.class, false, conf);
    FileSystem fs = crawlDbPath.getFileSystem(conf);

    // Input : the crawldb
    if (!fs.exists(crawlDbPath)) {
        throw new RuntimeException("CrawlDb not found");
    }

    Tap inputSource = new Hfs(new TextDelimited(CrawlDbDatum.FIELDS, "\t", CrawlDbDatum.TYPES),
            crawlDbPath.toString());
    Pipe importPipe = new Pipe("import pipe");

    // Split into tuples that are to be fetched and that have already been fetched
    SplitterAssembly splitter = new SplitterAssembly(importPipe, new SplitFetchedUnfetchedSSCrawlDatums());

    Pipe finishedDatumsFromDb = new Pipe("finished datums from db", splitter.getRHSPipe());
    Pipe urlsToFetchPipe = splitter.getLHSPipe();

    // Limit to MAX_DISTRIBUTED_FETCH if running in real cluster,
    // or MAX_LOCAL_FETCH if running locally. So first we sort the entries
    // from high to low by links score.
    // TODO add unit test
    urlsToFetchPipe = new GroupBy(urlsToFetchPipe, new Fields(CrawlDbDatum.LINKS_SCORE_FIELD), true);
    // Re-queries the local-mode flag instead of reusing isLocal computed above.
    long maxToFetch = HadoopUtils.isJobLocal(conf) ? MAX_LOCAL_FETCH : MAX_DISTRIBUTED_FETCH;
    urlsToFetchPipe = new Each(urlsToFetchPipe, new CreateUrlDatumFromCrawlDbDatum(maxToFetch));

    BaseScoreGenerator scorer = new LinkScoreGenerator();

    // Create the sub-assembly that runs the fetch job
    int maxThreads = isLocal ? CrawlConfig.DEFAULT_NUM_THREADS_LOCAL : CrawlConfig.DEFAULT_NUM_THREADS_CLUSTER;
    SimpleHttpFetcher fetcher = new SimpleHttpFetcher(maxThreads, fetcherPolicy, userAgent);
    fetcher.setMaxRetryCount(CrawlConfig.MAX_RETRIES);
    fetcher.setSocketTimeout(CrawlConfig.SOCKET_TIMEOUT);
    fetcher.setConnectionTimeout(CrawlConfig.CONNECTION_TIMEOUT);

    FetchPipe fetchPipe = new FetchPipe(urlsToFetchPipe, scorer, fetcher, numReducers);
    Pipe statusPipe = new Pipe("status pipe", fetchPipe.getStatusTailPipe());
    Pipe contentPipe = new Pipe("content pipe", fetchPipe.getContentTailPipe());
    contentPipe = TupleLogger.makePipe(contentPipe, true);

    // Create a parser that returns back the raw HTML (cleaned up by Tika) as the parsed content.
    // The parse pipe branches off the fetch pipe's content tail directly, not off contentPipe.
    SimpleParser parser = new SimpleParser(new ParserPolicy(), true);
    ParsePipe parsePipe = new ParsePipe(fetchPipe.getContentTailPipe(), parser);

    // NOTE(review): the Pipe named "analyzer pipe" is replaced by the very next statement and is
    // never attached to anything, so that name is not part of the assembly.
    Pipe analyzerPipe = new Pipe("analyzer pipe");
    analyzerPipe = new Each(parsePipe.getTailPipe(), new AnalyzeHtml());

    // Two branches off the analyzer output: outlinks (fed back into the crawl DB) and results.
    Pipe outlinksPipe = new Pipe("outlinks pipe", analyzerPipe);
    outlinksPipe = new Each(outlinksPipe, new CreateLinkDatumFromOutlinksFunction());

    Pipe resultsPipe = new Pipe("results pipe", analyzerPipe);
    resultsPipe = new Each(resultsPipe, new CreateResultsFunction());

    // Outer-join the finished datums, fetch status, analyzed pages and outlinks on their URL fields
    Pipe updatePipe = new CoGroup("update pipe",
            Pipe.pipes(finishedDatumsFromDb, statusPipe, analyzerPipe, outlinksPipe),
            Fields.fields(new Fields(CrawlDbDatum.URL_FIELD), new Fields(StatusDatum.URL_FN),
                    new Fields(AnalyzedDatum.URL_FIELD), new Fields(LinkDatum.URL_FN)),
            null, new OuterJoin());
    updatePipe = new Every(updatePipe, new UpdateCrawlDbBuffer(), Fields.RESULTS);

    // output : loop dir specific crawldb
    Path outCrawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
    Tap crawlDbSink = new Hfs(new TextLine(), outCrawlDbPath.toString());
    // Status
    Path statusDirPath = new Path(curLoopDirPath, CrawlConfig.STATUS_SUBDIR_NAME);
    Tap statusSink = new Hfs(new TextLine(), statusDirPath.toString());
    // Content
    Path contentDirPath = new Path(curLoopDirPath, CrawlConfig.CONTENT_SUBDIR_NAME);
    Tap contentSink = new Hfs(new SequenceFile(FetchedDatum.FIELDS), contentDirPath.toString());

    // PageResults
    Path resultsDirPath = new Path(curLoopDirPath, CrawlConfig.RESULTS_SUBDIR_NAME);
    Tap resultsSink = new Hfs(new TextLine(), resultsDirPath.toString());

    // Create the output map that connects each tail pipe to the appropriate sink.
    Map<String, Tap> sinkMap = new HashMap<String, Tap>();
    sinkMap.put(updatePipe.getName(), crawlDbSink);
    sinkMap.put(statusPipe.getName(), statusSink);
    sinkMap.put(contentPipe.getName(), contentSink);
    sinkMap.put(resultsPipe.getName(), resultsSink);

    FlowConnector flowConnector = new FlowConnector(props);
    Flow flow = flowConnector.connect(inputSource, sinkMap, updatePipe, statusPipe, contentPipe, resultsPipe);

    return flow;
}

From source file:bixo.utils.CrawlDirUtilsTest.java

License:Apache License

/** A loop dir created for loop 3 sits under the output path, is named "3-...", and exists. */
@Test
public void testMakeLoopDir() throws IOException {
    Path createdDir = CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, 3);

    String createdName = createdDir.toString();
    String expectedPrefix = _outputPath.toString() + "/3-";
    boolean hasLoopPrefix = createdName.startsWith(expectedPrefix);
    Assert.assertTrue(hasLoopPrefix);

    boolean existsOnFileSystem = _fileSystem.exists(createdDir);
    Assert.assertTrue(existsOnFileSystem);
}

From source file:bixo.utils.CrawlDirUtilsTest.java

License:Apache License

/** With loop dirs 0, 1, 3, 7 and 11 present, the latest loop dir is the one for loop 11. */
@Test
public void testFindLatestLoopDir() throws IOException {
    // Earlier loops, created in ascending order before the one we expect to be found.
    for (int earlierLoop : new int[] { 0, 1, 3, 7 }) {
        CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, earlierLoop);
    }
    Path newestDir = CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, 11);
    String expected = newestDir.makeQualified(_fileSystem).toString();

    Path foundDir = CrawlDirUtils.findLatestLoopDir(_fileSystem, _outputPath);
    String actual = foundDir.makeQualified(_fileSystem).toString();

    Assert.assertEquals(expected, actual);
}

From source file:bixo.utils.CrawlDirUtilsTest.java

License:Apache License

/**
 * With loop dirs 0, 1, 3, 7 and 11 present, the next loop dir after 0 is 1, after 1 is 3,
 * and after 4 (which has no dir of its own) is 7.
 */
@Test
public void testFindNextLoopDir() throws IOException {
    CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, 0);
    String expectedAfter0 = CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, 1)
            .makeQualified(_fileSystem).toString();
    String expectedAfter1 = CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, 3)
            .makeQualified(_fileSystem).toString();
    String expectedAfter4 = CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, 7)
            .makeQualified(_fileSystem).toString();
    CrawlDirUtils.makeLoopDir(_fileSystem, _outputPath, 11);

    Path nextAfter0 = CrawlDirUtils.findNextLoopDir(_fileSystem, _outputPath, 0);
    Assert.assertEquals(expectedAfter0, nextAfter0.makeQualified(_fileSystem).toString());

    Path nextAfter1 = CrawlDirUtils.findNextLoopDir(_fileSystem, _outputPath, 1);
    Assert.assertEquals(expectedAfter1, nextAfter1.makeQualified(_fileSystem).toString());

    Path nextAfter4 = CrawlDirUtils.findNextLoopDir(_fileSystem, _outputPath, 4);
    Assert.assertEquals(expectedAfter4, nextAfter4.makeQualified(_fileSystem).toString());
}

From source file:boa.aggregators.MLAggregator.java

License:Apache License

/**
 * Serializes {@code model} with Java serialization to
 * {@code /boa/<jobId>/<key-prefix>ML.model} and emits that path via {@code collect}.
 *
 * <p>The job id is read from the {@code boa.hadoop.jobid} configuration entry (default 0);
 * the key prefix is the text of {@code getKey()} up to the first {@code '['}. If the model
 * file already exists, nothing is written or emitted.
 *
 * <p>This is best effort: any exception is printed and swallowed. The model is serialized
 * in memory before the file is created, and a file that could not be written completely
 * is deleted again, so a failed attempt does not leave a truncated model behind that
 * would make later calls skip saving.
 *
 * @param model the object to persist; it must be serializable for anything to be written
 */
public void saveModel(Object model) {
    FSDataOutputStream out = null;
    FileSystem fileSystem = null;
    Path filePath = null;
    boolean modelWritten = false;
    try {
        JobContext context = (JobContext) getContext();
        Configuration configuration = context.getConfiguration();
        int boaJobId = configuration.getInt("boa.hadoop.jobid", 0);
        JobConf job = new JobConf(configuration);
        Path outputPath = FileOutputFormat.getOutputPath(job);
        fileSystem = outputPath.getFileSystem(context.getConfiguration());

        fileSystem.mkdirs(new Path("/boa", new Path("" + boaJobId)));
        filePath = new Path("/boa",
                new Path("" + boaJobId, new Path(("" + getKey()).split("\\[")[0] + "ML.model")));

        if (fileSystem.exists(filePath)) {
            return;
        }

        // Serialize first: a non-serializable model then fails before any file is created.
        ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
        ObjectOutputStream objectOut = new ObjectOutputStream(byteOutStream);
        try {
            objectOut.writeObject(model);
        } finally {
            objectOut.close();
        }
        byte[] serializedObject = byteOutStream.toByteArray();

        out = fileSystem.create(filePath);
        out.write(serializedObject, 0, serializedObject.length);
        // Close before emitting the path so consumers only ever see a completely written file.
        out.close();
        modelWritten = true;

        this.collect(filePath.toString());

    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // out != null with modelWritten == false means the file was created but not completed.
        if (out != null && !modelWritten) {
            try {
                out.close();
            } catch (final Exception e) {
                e.printStackTrace();
            }
            try {
                // Remove the partial file; otherwise the exists() check above would block retries.
                fileSystem.delete(filePath, false);
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:boa.functions.BoaAstIntrinsics.java

License:Apache License

/**
 * Opens the AST map file and stores the reader in {@code map}.
 *
 * <p>The location is {@code hdfs://boa-njt/<dir>/ast}, where {@code <dir>} is taken from the
 * job configuration entry {@code boa.ast.dir}, falling back to {@code boa.input.dir} and
 * finally to {@code repcache/live}.
 *
 * <p>This is best effort: any exception is printed and swallowed, in which case
 * {@code map} is left unchanged.
 */
private static void openMap() {
    final Configuration conf = new Configuration();
    try {
        final Path p = new Path("hdfs://boa-njt/",
                new Path(
                        context.getConfiguration().get("boa.ast.dir",
                                context.getConfiguration().get("boa.input.dir", "repcache/live")),
                        new Path("ast")));
        // Resolve the file system from the path's own scheme and authority. The default file
        // system of a fresh Configuration need not be the hdfs://boa-njt cluster named above.
        final FileSystem fs = p.getFileSystem(conf);
        map = new MapFile.Reader(fs, p.toString(), conf);
    } catch (final Exception e) {
        e.printStackTrace();
    }
}