Example usage for org.apache.hadoop.fs Path getName

List of usage examples for org.apache.hadoop.fs Path getName

Introduction

On this page you can find example usage for org.apache.hadoop.fs.Path#getName.

Prototype

public String getName() 

Document

Returns the final component of this path.
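
For orientation, here is a minimal, self-contained sketch of that behavior (the class name and sample paths are illustrative, not taken from the examples below): getName() returns only the last component of a path, independent of the scheme, authority, or parent directories.

import org.apache.hadoop.fs.Path;

public class GetNameDemo {
    public static void main(String[] args) {
        // getName() returns only the final path component.
        Path p = new Path("hdfs://namenode:8020/user/data/part-00000.gz");
        System.out.println(p.getName());             // part-00000.gz

        // The parent's name is the enclosing directory component.
        System.out.println(p.getParent().getName()); // data

        // A root path has no final component, so getName() returns an empty string.
        System.out.println(new Path("/").getName()); // prints an empty line
    }
}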

Usage

From source file:azkaban.viewer.hdfs.ImageFileViewer.java

License:Apache License

@Override
public Set<Capability> getCapabilities(FileSystem fs, Path path) throws AccessControlException {
    String fileName = path.getName();
    int pos = fileName.lastIndexOf('.');
    if (pos < 0) {
        return EnumSet.noneOf(Capability.class);
    }

    String suffix = fileName.substring(pos).toLowerCase();
    if (acceptedSuffix.contains(suffix)) {
        long len = 0;
        try {
            len = fs.getFileStatus(path).getLen();
        } catch (AccessControlException e) {
            throw e;
        } catch (IOException e) {
            e.printStackTrace();
            return EnumSet.noneOf(Capability.class);
        }

        if (len <= MAX_IMAGE_FILE_SIZE) {
            return EnumSet.of(Capability.READ);
        }
    }

    return EnumSet.noneOf(Capability.class);
}

From source file:azkaban.web.pages.HdfsBrowserServlet.java

License:Apache License

private void displayDir(HttpServletRequest req, HttpServletResponse resp, Path path) throws IOException {

    Page page = newPage(req, resp, "azkaban/web/pages/hdfs_browser_dir.vm");

    List<Path> paths = new ArrayList<Path>();
    List<String> segments = new ArrayList<String>();
    Path curr = path;
    while (curr.getParent() != null) {
        paths.add(curr);
        segments.add(curr.getName());
        curr = curr.getParent();
    }

    Collections.reverse(paths);
    Collections.reverse(segments);

    page.add("paths", paths);
    page.add("segments", segments);
    page.add("subdirs", _fs.listStatus(path)); // ??? line
    page.render();

}

From source file:azkaban.webapp.servlet.hdfsviewer.BsonFileViewer.java

License:Apache License

@Override
public Set<Capability> getCapabilities(FileSystem fs, Path path) {
    if (path.getName().endsWith(".bson")) {
        return EnumSet.of(Capability.READ);
    }
    return EnumSet.noneOf(Capability.class);
}

From source file:azkaban.webapp.servlet.hdfsviewer.ImageFileViewer.java

License:Apache License

public Set<Capability> getCapabilities(FileSystem fs, Path path) {
    String fileName = path.getName();
    int pos = fileName.lastIndexOf('.');
    if (pos < 0) {
        return EnumSet.noneOf(Capability.class);
    }

    String suffix = fileName.substring(pos).toLowerCase();
    if (acceptedSuffix.contains(suffix)) {
        long len = 0;
        try {
            len = fs.getFileStatus(path).getLen();
        } catch (IOException e) {
            e.printStackTrace();
            return EnumSet.noneOf(Capability.class);
        }

        if (len <= MAX_IMAGE_FILE_SIZE) {
            return EnumSet.of(Capability.READ);
        }
    }

    return EnumSet.noneOf(Capability.class);
}

From source file:babel.prep.extract.NutchPageExtractor.java

License:Apache License

/**
 * Configures the extraction job.
 */
protected JobConf createJobConf(String crawlDir) throws IOException {
    Path segmentsPath = new Path(crawlDir, SEGMENTS_SUBDIR);

    List<Path> segPaths = allSegmentDirs(segmentsPath);
    StringBuilder allSegNames = new StringBuilder();

    for (int i = 0; i < segPaths.size(); i++) {
        allSegNames.append(" " + segPaths.get(i).getName());
    }

    String timeStamp = getCurTimeStamp();

    JobConf job = new NutchJob(getConf());
    job.setJobName("read segments" + allSegNames.toString());

    // Specify what info to extract
    job.setBoolean("segment.reader.co", m_co);
    job.setBoolean("segment.reader.fe", m_fe);
    job.setBoolean("segment.reader.ge", m_ge);
    job.setBoolean("segment.reader.pa", m_pa);
    job.setBoolean("segment.reader.pd", m_pd);
    job.setBoolean("segment.reader.pt", m_pt);

    // Specify the paths to extract from for each segment
    for (int i = 0; i < segPaths.size(); i++) {
        if (m_ge)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.GENERATE_DIR_NAME));
        if (m_fe)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.FETCH_DIR_NAME));
        if (m_pa)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), CrawlDatum.PARSE_DIR_NAME));
        if (m_co)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), Content.DIR_NAME));
        if (m_pd)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseData.DIR_NAME));
        if (m_pt)
            FileInputFormat.addInputPath(job, new Path(segPaths.get(i), ParseText.DIR_NAME));
    }

    // Specify the segments directory so that mapper can recover segment info
    job.set(JOB_PROP_SEGMENTS_DIR, segmentsPath.getName());
    // Store the start time/date of this job
    job.set(JOB_PROP_JOB_TIMESTAMP, timeStamp);

    job.setInputFormat(SequenceFileInputFormat.class);
    job.setMapperClass(PageExtMapper.class);
    job.setReducerClass(PageExtReducer.class);

    job.setMapOutputValueClass(NutchChunk.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Page.class);

    Path outDir = new Path(new Path(crawlDir, PAGES_SUBDIR), "pages.extract." + timeStamp);
    m_fs.delete(outDir, true);

    FileOutputFormat.setOutputPath(job, outDir);

    setUniqueTempDir(job);

    return job;
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.HalvadeMapper.java

protected String checkBinaries(Context context) throws IOException {
    Logger.DEBUG("Checking for binaries...");
    String binDir = null;
    URI[] localPaths = context.getCacheArchives();
    for (int i = 0; i < localPaths.length; i++) {
        Path path = new Path(localPaths[i].getPath());
        if (path.getName().endsWith("bin.tar.gz")) {
            binDir = "./" + path.getName() + "/bin/";
        }
    }
    printDirectoryTree(new File(binDir), 0);
    return binDir;
}

From source file:be.ugent.intec.halvade.hadoop.mapreduce.HalvadeReducer.java

protected String checkBinaries(Reducer.Context context) throws IOException {
    Logger.DEBUG("Checking for binaries...");
    String binDir = null;
    URI[] localPaths = context.getCacheArchives();
    for (int i = 0; i < localPaths.length; i++) {
        Path path = new Path(localPaths[i].getPath());
        if (path.getName().endsWith("bin.tar.gz")) {
            binDir = "./" + path.getName() + "/bin/";
        }
    }
    printDirectoryTree(new File(binDir), 0);
    return binDir;
}

From source file:bigsatgps.BigDataHandler.java

License:Open Source License

/**
 *
 * @param infile
 * @return
 * @throws Exception
 */
public String ImageToSequence(String infile) throws Exception {
    String log4jConfPath = "lib/log4j.properties";
    PropertyConfigurator.configure(log4jConfPath);
    confHadoop = new Configuration();
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(confHadoop);
    Path inPath = new Path(infile);
    String outfile = infile.substring(0, infile.indexOf(".")) + ".seq";
    Path outPath = new Path(outfile);
    FSDataInputStream in = null;
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    SequenceFile.Writer writer = null;
    try {
        in = fs.open(inPath);
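        // Note: sizing the buffer from available() and issuing a single read() assumes
        // the whole file is small enough to come back in one call.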
        byte buffer[] = new byte[in.available()];
        in.read(buffer);
        writer = SequenceFile.createWriter(fs, confHadoop, outPath, key.getClass(), value.getClass());
        writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
        IOUtils.closeStream(writer);
        System.out.println("Successfully created the sequencefile " + outfile);
        return outfile;
    } catch (IOException e) {
        System.err.println("Exception MESSAGES = " + e.getMessage());
        IOUtils.closeStream(writer);
        return null;
    }
}

From source file:bixo.examples.crawl.DemoCrawlTool.java

License:Apache License

public static void main(String[] args) {
    DemoCrawlToolOptions options = new DemoCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user want to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // If the user is starting from scratch, set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        // this is to cause Bixo to block waiting for next time it can fetch from a particular site.
        // todo: may not be necessary in future versions of Bixo
        //            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);

        // It is a good idea to set up a crawl duration when running long crawls as you may 
        // end up in situations where the fetch slows down due to a 'long tail' and by 
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != DemoCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = DemoCrawlWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy, userAgent,
                    urlFilter, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            //              flow.writeDOT("build/valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:bixo.examples.crawl.JDBCCrawlTool.java

License:Apache License

public static void main(String[] args) {
    JDBCCrawlToolOptions options = new JDBCCrawlToolOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    if (domain != null) {
        validateDomain(domain, parser);
    }
    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // See if the user is starting from scratch
        if (options.getDbLocation() == null) {
            if (fs.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDirName);
                System.out.println("Warning: Delete the output dir before running");
                fs.delete(outputPath, true);
            }
        } else {
            Path dbLocationPath = new Path(options.getDbLocation());
            if (!fs.exists(dbLocationPath)) {
                fs.mkdirs(dbLocationPath);
            }
        }

        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            if (domain == null) {
                System.err.println("For a new crawl the domain needs to be specified" + domain);
                printUsageAndExit(parser);
            }
            importOneDomain(domain, JDBCTapFactory.createUrlsSinkJDBCTap(options.getDbLocation()), conf);
        }

        Path inputPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (inputPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        int startLoop = CrawlDirUtils.extractLoopNumber(inputPath);
        int endLoop = startLoop + options.getNumLoops();

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy defaultPolicy = new FetcherPolicy();
        defaultPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != JDBCCrawlToolOptions.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlFilter(patterns.toArray(new String[patterns.size()]));

        // Now we're ready to start looping, since we've got our current settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = JDBCCrawlWorkflow.createFlow(inputPath, curLoopDir, userAgent, defaultPolicy, urlFilter,
                    options.getMaxThreads(), options.isDebugLogging(), options.getDbLocation());
            flow.complete();
            // flow.writeDOT("build/valid-flow.dot");

            // Input for the next round is our current output
            inputPath = curLoopDir;
        }
    } catch (PlannerException e) {
        e.writeDOT("build/failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
    JDBCTapFactory.shutdown();
}