Example usage for org.apache.hadoop.fs FileSystem mkdirs

List of usage examples for org.apache.hadoop.fs FileSystem mkdirs

Introduction

This page lists example usages of org.apache.hadoop.fs.FileSystem mkdirs.

Prototype

public boolean mkdirs(Path f) throws IOException 

Document

Call #mkdirs(Path,FsPermission) with default permission.
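
Before the usage examples, here is a minimal, self-contained sketch (not taken from any of the sources below) showing both the single-argument call and the mkdirs(Path, FsPermission) overload it delegates to. The class name MkdirsExample and the paths are placeholders chosen for illustration.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;

public class MkdirsExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path dir = new Path("/tmp/example-dir"); // placeholder path
        FileSystem fs = dir.getFileSystem(conf);

        // mkdirs(Path) creates the directory and any missing parents with the
        // default permission; it returns false on failure, a condition several
        // of the examples below check explicitly.
        if (!fs.mkdirs(dir)) {
            throw new IOException("Could not create " + dir);
        }

        // The two-argument overload accepts an explicit permission instead.
        fs.mkdirs(new Path(dir, "restricted"), new FsPermission("700"));
    }
}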

Usage

From source file:com.endgame.binarypig.util.BuildSequenceFileFromArchive.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    File inDirOrFile = new File(args[0]);
    Path outputDir = new Path(args[1]);

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    if (!fs.exists(outputDir)) {
        fs.mkdirs(outputDir);
    }

    if (inDirOrFile.isFile()) {
        load(fs, conf, inDirOrFile, outputDir);
    } else {
        for (File file : inDirOrFile.listFiles()) {
            if (!file.isFile()) {
                System.out.println("Skipping " + file + " (not a file) ...");
                continue;
            }

            load(fs, conf, file, outputDir);
        }
    }

    return 0;
}

From source file:com.explorys.apothecary.hbase.mr.inputformat.MergedStoreFileInputFormatTest.java

License:Apache License

protected HRegion createNewHRegion(HTableDescriptor desc, byte[] startKey, byte[] endKey) throws IOException {
    Configuration conf = HBaseConfiguration.create();
    FileSystem filesystem = FileSystem.get(conf);
    Path rootdir = filesystem.makeQualified(new Path(conf.get(HConstants.HBASE_DIR)));
    filesystem.mkdirs(rootdir);

    return HRegion.createHRegion(new HRegionInfo(desc, startKey, endKey), rootdir, conf);
}

From source file:com.facebook.hiveio.common.FileSystems.java

License:Apache License

/**
 * Move a file or directory from source to destination, recursively copying
 * subdirectories.
 *
 * @param fs FileSystem
 * @param file path to copy (file or directory)
 * @param src path to source directory
 * @param dest path to destination directory
 * @throws IOException I/O problems
 */
public static void move(FileSystem fs, Path file, Path src, Path dest) throws IOException {
    Path destFilePath = pathInDestination(file, src, dest);
    if (fs.isFile(file)) {
        if (fs.exists(destFilePath)) {
            if (!fs.delete(destFilePath, true)) {
                throw new IllegalArgumentException("Could not remove existing file " + destFilePath);
            }
        }
        if (!fs.rename(file, destFilePath)) {
            throw new IllegalArgumentException("Could not move " + file + " to " + destFilePath);
        }
    } else if (fs.getFileStatus(file).isDir()) {
        FileStatus[] statuses = fs.listStatus(file);
        fs.mkdirs(destFilePath);
        if (statuses != null) {
            for (FileStatus status : statuses) {
                move(fs, status.getPath(), src, dest);
            }
        }
    }
}

From source file:com.facebook.hiveio.common.HadoopUtils.java

License:Apache License

/**
 * Set worker output directory
 * @param context Task context
 * @throws IOException I/O errors
 */
public static void setWorkOutputDir(TaskAttemptContext context) throws IOException {
    Configuration conf = context.getConfiguration();
    String outputPath = getOutputDir(conf);
    // we need to do this to get the task path and set it for mapred
    // implementation since it can't be done automatically because of
    // mapreduce->mapred abstraction
    if (outputPath != null) {
        FileOutputCommitter foc = new FileOutputCommitter(getOutputPath(conf), context);
        Path path = foc.getWorkPath();
        FileSystem fs = path.getFileSystem(conf);
        fs.mkdirs(path);
        conf.set("mapred.work.output.dir", path.toString());
        LOG.info("Setting mapred.work.output.dir to {}", path.toString());
    }
}

From source file:com.facebook.LinkBench.LinkBenchDriverMR.java

License:Apache License

/**
 * setup input files for map reduce job
 * @param jobconf configuration of the map reduce job
 * @param nmappers number of mappers (loader or requester)
 */
private static FileSystem setupInputFiles(JobConf jobconf, int nmappers)
        throws IOException, InterruptedException {
    //setup input/output directories
    final Path indir = new Path(TMP_DIR, "in");
    final Path outdir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobconf, indir);
    FileOutputFormat.setOutputPath(jobconf, outdir);

    final FileSystem fs = FileSystem.get(jobconf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(indir)) {
        throw new IOException("Cannot create input directory " + indir);
    }

    //generate an input file for each map task
    if (USE_INPUT_FILES) {
        for (int i = 0; i < nmappers; ++i) {
            final Path file = new Path(indir, "part" + i);
            final IntWritable mapperid = new IntWritable(i);
            final IntWritable nummappers = new IntWritable(nmappers);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobconf, file, IntWritable.class,
                    IntWritable.class, CompressionType.NONE);
            try {
                writer.append(mapperid, nummappers);
            } finally {
                writer.close();
            }
            logger.info("Wrote input for Map #" + i);
        }
    }
    return fs;
}

From source file:com.finderbots.miner2.pinterest.PinterestCrawlAndMinerTool.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L);//10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        AnalyzeHtml analyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            analyzer = new AnalyzeHtml(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = PinterestCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, analyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.finderbots.miner2.tomatoes.RTCriticsCrawlAndMinerTool.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        printUsageAndExit(parser);
    }

    // Before we get too far along, see if the domain looks valid.
    String domain = options.getDomain();
    String urlsFile = options.getUrlsFile();
    if (domain != null) {
        validateDomain(domain, parser);
    } else {
        if (urlsFile == null) {
            System.err.println(
                    "Either a target domain should be specified or a file with a list of urls needs to be provided");
            printUsageAndExit(parser);
        }
    }

    if (domain != null && urlsFile != null) {
        System.out.println("Warning: Both domain and urls file list provided - using domain");
    }

    String outputDirName = options.getOutputDir();
    if (options.isDebugLogging()) {
        System.setProperty("bixo.root.level", "DEBUG");
    } else {
        System.setProperty("bixo.root.level", "INFO");
    }

    if (options.getLoggingAppender() != null) {
        // Set console vs. DRFA vs. something else
        System.setProperty("bixo.appender", options.getLoggingAppender());
    }

    String logsDir = options.getLogsDir();
    if (!logsDir.endsWith("/")) {
        logsDir = logsDir + "/";
    }

    try {
        JobConf conf = new JobConf();
        Path outputPath = new Path(outputDirName);
        FileSystem fs = outputPath.getFileSystem(conf);

        // First check if the user wants to clean
        if (options.isCleanOutputDir()) {
            if (fs.exists(outputPath)) {
                fs.delete(outputPath, true);
            }
        }

        // See if the user isn't starting from scratch then set up the
        // output directory and create an initial urls subdir.
        if (!fs.exists(outputPath)) {
            fs.mkdirs(outputPath);

            // Create a "0-<timestamp>" sub-directory with just a /crawldb subdir
            // In the /crawldb dir the input file will have a single URL for the target domain.

            Path curLoopDir = CrawlDirUtils.makeLoopDir(fs, outputPath, 0);
            String curLoopDirName = curLoopDir.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, 0);

            Path crawlDbPath = new Path(curLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

            if (domain != null) {
                importOneDomain(domain, crawlDbPath, conf);
            } else {
                importUrls(urlsFile, crawlDbPath);
            }
        }

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, outputPath);

        if (latestDirPath == null) {
            System.err.println("No previous cycle output dirs exist in " + outputDirName);
            printUsageAndExit(parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        // Set up the start and end loop counts.
        int startLoop = CrawlDirUtils.extractLoopNumber(latestDirPath);
        int endLoop = startLoop + options.getNumLoops();

        // Set up the UserAgent for the fetcher.
        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        // You also get to customize the FetcherPolicy
        FetcherPolicy defaultPolicy;
        if (options.getCrawlDuration() != 0) {
            defaultPolicy = new AdaptiveFetcherPolicy(options.getEndCrawlTime(), options.getCrawlDelay());
        } else {
            defaultPolicy = new FetcherPolicy();
        }
        defaultPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        defaultPolicy.setRequestTimeout(10L * 1000L);//10 seconds

        // COMPLETE for crawling a single site, EFFICIENT for many sites
        if (options.getCrawlPolicy().equals(Options.IMPOLITE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.IMPOLITE);
        } else if (options.getCrawlPolicy().equals(Options.EFFICIENT_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
        } else if (options.getCrawlPolicy().equals(Options.COMPLETE_CRAWL_POLICY)) {
            defaultPolicy.setFetcherMode(FetcherPolicy.FetcherMode.COMPLETE);
        }

        // It is a good idea to set up a crawl duration when running long crawls as you may
        // end up in situations where the fetch slows down due to a 'long tail' and by
        // specifying a crawl duration you know exactly when the crawl will end.
        int crawlDurationInMinutes = options.getCrawlDuration();
        boolean hasEndTime = crawlDurationInMinutes != Options.NO_CRAWL_DURATION;
        long targetEndTime = hasEndTime
                ? System.currentTimeMillis() + (crawlDurationInMinutes * CrawlConfig.MILLISECONDS_PER_MINUTE)
                : FetcherPolicy.NO_CRAWL_END_TIME;

        // By setting up a url filter we only deal with urls that we want to
        // instead of all the urls that we extract.
        BaseUrlFilter urlFilter = null;
        List<String> patterns = null;
        String regexUrlFiltersFile = options.getRegexUrlFiltersFile();
        if (regexUrlFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlFiltersFile);
        } else {
            patterns = RegexUrlDatumFilter.getDefaultUrlFilterPatterns();
            if (domain != null) {
                String domainPatterStr = "+(?i)^(http|https)://([a-z0-9]*\\.)*" + domain;
                patterns.add(domainPatterStr);
            } else {
                String protocolPatterStr = "+(?i)^(http|https)://*";
                patterns.add(protocolPatterStr);
                //Log.warn("Defaulting to basic url regex filtering (just suffix and protocol");
            }
        }
        urlFilter = new RegexUrlDatumFilter(patterns.toArray(new String[patterns.size()]));

        // get a list of patterns which tell the miner which URLs to include or exclude.
        patterns.clear();
        RegexUrlStringFilter urlsToMineFilter = null;
        String regexUrlsToMineFiltersFile = options.getRegexUrlToMineFile();
        MineRTCriticsPreferences prefsAnalyzer = null;
        if (regexUrlsToMineFiltersFile != null) {
            patterns = RegexUrlDatumFilter.getUrlFilterPatterns(regexUrlsToMineFiltersFile);
            urlsToMineFilter = new RegexUrlStringFilter(patterns.toArray(new String[patterns.size()]));
            prefsAnalyzer = new MineRTCriticsPreferences(urlsToMineFilter);
        }

        // OK, now we're ready to start looping, since we've got our current
        // settings
        for (int curLoop = startLoop + 1; curLoop <= endLoop; curLoop++) {

            // Adjust target end time, if appropriate.
            if (hasEndTime) {
                int remainingLoops = (endLoop - curLoop) + 1;
                long now = System.currentTimeMillis();
                long perLoopTime = (targetEndTime - now) / remainingLoops;
                defaultPolicy.setCrawlEndTime(now + perLoopTime);
            }

            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, outputPath, curLoop);
            String curLoopDirName = curLoopDirPath.getName();
            setLoopLoggerFile(logsDir + curLoopDirName, curLoop);

            Flow flow = RTCriticsCrawlAndMinerWorkflow.createFlow(curLoopDirPath, crawlDbPath, defaultPolicy,
                    userAgent, urlFilter, prefsAnalyzer, options);
            flow.complete();

            // Writing out .dot files is a good way to verify your flows.
            flow.writeDOT("valid-flow.dot");

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (PlannerException e) {
        e.writeDOT("failed-flow.dot");
        System.err.println("PlannerException: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    } catch (Throwable t) {
        System.err.println("Exception running tool: " + t.getMessage());
        t.printStackTrace(System.err);
        System.exit(-1);
    }
}

From source file:com.github.dryangkun.hbase.tidx.hive.HiveHFileOutputFormat.java

License:Apache License

@Override
public RecordWriter getHiveRecordWriter(final JobConf jc, final Path finalOutPath,
        Class<? extends Writable> valueClass, boolean isCompressed, Properties tableProperties,
        final Progressable progressable) throws IOException {

    // Read configuration for the target path, first from jobconf, then from table properties
    String hfilePath = getFamilyPath(jc, tableProperties);
    if (hfilePath == null) {
        throw new RuntimeException("Please set " + HFILE_FAMILY_PATH + " to target location for HFiles");
    }

    // Target path's last component is also the column family name.
    final Path columnFamilyPath = new Path(hfilePath);
    final String columnFamilyName = columnFamilyPath.getName();
    final byte[] columnFamilyNameBytes = Bytes.toBytes(columnFamilyName);
    final Job job = new Job(jc);
    setCompressOutput(job, isCompressed);
    setOutputPath(job, finalOutPath);

    // Create the HFile writer
    final org.apache.hadoop.mapreduce.TaskAttemptContext tac = ShimLoader.getHadoopShims()
            .newTaskAttemptContext(job.getConfiguration(), progressable);

    final Path outputdir = FileOutputFormat.getOutputPath(tac);
    final org.apache.hadoop.mapreduce.RecordWriter<ImmutableBytesWritable, KeyValue> fileWriter = getFileWriter(
            tac);

    // Individual columns are going to be pivoted to HBase cells,
    // and for each row, they need to be written out in order
    // of column name, so sort the column names now, creating a
    // mapping to their column position.  However, the first
    // column is interpreted as the row key.
    String columnList = tableProperties.getProperty("columns");
    String[] columnArray = columnList.split(",");
    final SortedMap<byte[], Integer> columnMap = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
    int i = 0;
    for (String columnName : columnArray) {
        if (i != 0) {
            columnMap.put(Bytes.toBytes(columnName), i);
        }
        ++i;
    }

    return new RecordWriter() {

        @Override
        public void close(boolean abort) throws IOException {
            try {
                fileWriter.close(null);
                if (abort) {
                    return;
                }
                // Move the hfiles file(s) from the task output directory to the
                // location specified by the user.
                FileSystem fs = outputdir.getFileSystem(jc);
                fs.mkdirs(columnFamilyPath);
                Path srcDir = outputdir;
                for (;;) {
                    FileStatus[] files = fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER);
                    if ((files == null) || (files.length == 0)) {
                        throw new IOException("No family directories found in " + srcDir);
                    }
                    if (files.length != 1) {
                        throw new IOException("Multiple family directories found in " + srcDir);
                    }
                    srcDir = files[0].getPath();
                    if (srcDir.getName().equals(columnFamilyName)) {
                        break;
                    }
                }
                for (FileStatus regionFile : fs.listStatus(srcDir, FileUtils.STAGING_DIR_PATH_FILTER)) {
                    fs.rename(regionFile.getPath(), new Path(columnFamilyPath, regionFile.getPath().getName()));
                }
                // Hive actually wants a file as task output (not a directory), so
                // replace the empty directory with an empty file to keep it happy.
                fs.delete(outputdir, true);
                fs.createNewFile(outputdir);
            } catch (InterruptedException ex) {
                throw new IOException(ex);
            }
        }

        private void writeText(Text text) throws IOException {
            // Decompose the incoming text row into fields.
            String s = text.toString();
            String[] fields = s.split("\u0001");
            assert (fields.length <= (columnMap.size() + 1));
            // First field is the row key.
            byte[] rowKeyBytes = Bytes.toBytes(fields[0]);
            // Remaining fields are cells addressed by column name within row.
            for (Map.Entry<byte[], Integer> entry : columnMap.entrySet()) {
                byte[] columnNameBytes = entry.getKey();
                int iColumn = entry.getValue();
                String val;
                if (iColumn >= fields.length) {
                    // trailing blank field
                    val = "";
                } else {
                    val = fields[iColumn];
                    if ("\\N".equals(val)) {
                        // omit nulls
                        continue;
                    }
                }
                byte[] valBytes = Bytes.toBytes(val);
                KeyValue kv = new KeyValue(rowKeyBytes, columnFamilyNameBytes, columnNameBytes, valBytes);
                try {
                    fileWriter.write(null, kv);
                } catch (IOException e) {
                    LOG.error("Failed while writing row: " + s);
                    throw e;
                } catch (InterruptedException ex) {
                    throw new IOException(ex);
                }
            }
        }

        private void writePut(PutWritable put) throws IOException {
            ImmutableBytesWritable row = new ImmutableBytesWritable(put.getPut().getRow());
            SortedMap<byte[], List<Cell>> cells = put.getPut().getFamilyCellMap();
            for (Map.Entry<byte[], List<Cell>> entry : cells.entrySet()) {
                Collections.sort(entry.getValue(), new CellComparator());
                for (Cell c : entry.getValue()) {
                    try {
                        fileWriter.write(row, KeyValueUtil.copyToNewKeyValue(c));
                    } catch (InterruptedException e) {
                        throw (InterruptedIOException) new InterruptedIOException().initCause(e);
                    }
                }
            }
        }

        @Override
        public void write(Writable w) throws IOException {
            if (w instanceof Text) {
                writeText((Text) w);
            } else if (w instanceof PutWritable) {
                writePut((PutWritable) w);
            } else {
                throw new IOException("Unexpected writable " + w);
            }
        }
    };
}

From source file:com.github.gaoyangthu.demo.mapred.PiEstimator.java

License:Apache License

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
    //setup job conf
    jobConf.setJobName(PiEstimator.class.getSimpleName());

    jobConf.setInputFormat(SequenceFileInputFormat.class);

    jobConf.setOutputKeyClass(BooleanWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapperClass(PiMapper.class);
    jobConf.setNumMapTasks(numMaps);

    jobConf.setReducerClass(PiReducer.class);
    jobConf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(TMP_DIR, "in");
    final Path outDir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobConf, inDir);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        //generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class,
                    LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            System.out.println("Wrote input for Map #" + i);
        }

        //start a map/reduce job
        System.out.println("Starting Job");
        final long startTime = System.currentTimeMillis();
        JobClient.runJob(jobConf);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        System.out.println("Job Finished in " + duration + " seconds");

        //read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        //compute estimated value
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
                .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
    } finally {
        fs.delete(TMP_DIR, true);
    }
}

From source file:com.github.libsml.commons.util.HadoopUtils.java

License:Apache License

public static void mkdir(Path path, boolean overwrite) throws IOException {

    Configuration config = new Configuration();
    FileSystem fs = path.getFileSystem(config);
    if (fs.exists(path) && !overwrite) {
        throw new IllegalStateException("Mkdir exception:path=" + path.toString() + " exists");
    }
    if (fs.exists(path)) {
        fs.delete(path, true);
    }
    fs.mkdirs(path);
    fs.close();
}