Example usage for org.apache.hadoop.fs FileUtil stat2Paths

List of usage examples for org.apache.hadoop.fs FileUtil stat2Paths

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileUtil stat2Paths.

Prototype

public static Path[] stat2Paths(FileStatus[] stats) 

Document

Convert an array of FileStatus to an array of Path.
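
Before the full examples, here is a minimal, self-contained sketch of the typical pattern (the directory /tmp/input is a hypothetical placeholder, not taken from the examples below): list a directory with FileSystem.listStatus and pass the resulting FileStatus[] to FileUtil.stat2Paths to get plain Path objects.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;

public class Stat2PathsExample {

    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        // list a (hypothetical) input directory; listStatus returns FileStatus[]
        FileStatus[] statuses = fs.listStatus(new Path("/tmp/input"));

        // convert the FileStatus[] to Path[] so the paths can be opened, filtered, etc.
        Path[] paths = FileUtil.stat2Paths(statuses);

        for (Path p : paths) {
            System.out.println(p);
        }
    }
}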

Usage

From source file:ph.fingra.hadoop.mapred.common.CopyWithinHdfsFile.java

License:Apache License

public void dirToFile(String srcdir, String dstfile) throws IOException {

    FileSystem shfs = FileSystem.get(URI.create(srcdir), getConf());
    FileSystem thfs = FileSystem.get(URI.create(dstfile), getConf());

    Path srcPath = new Path(srcdir);
    Path dstPath = new Path(dstfile);

    // delete the destination file if it already exists
    if (thfs.exists(dstPath)) {
        thfs.delete(dstPath, true);
    }

    // get hdfs file list
    PathFilter resultFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(ConstantVars.RESULT_FILE_PREFIX);
        }
    };

    FileStatus[] status = shfs.listStatus(srcPath, resultFileFilter);

    Path[] listedPaths = FileUtil.stat2Paths(status);

    if (listedPaths.length > 0) {
        // create hdfs output stream
        FSDataOutputStream out = thfs.create(dstPath);
        for (int i = 0; i < listedPaths.length; i++) {
            // create hdfs input stream
            FSDataInputStream in = shfs.open(listedPaths[i]);
            byte buffer[] = new byte[256];
            int bytesRead = 0;
            while ((bytesRead = in.read(buffer)) > 0) {
                out.write(buffer, 0, bytesRead);
            }
            in.close();
        }
        out.close();
    }

    return;
}

From source file:ph.fingra.hadoop.mapred.common.HdfsFileUtil.java

License:Apache License

public static int getDateMatchedFileCount(Path srcpath) throws IOException {

    int count = 0;
    Path parentPath = null;
    String date_ext = null;

    // directory path
    parentPath = srcpath.getParent();

    // date pattern
    Pattern p = Pattern.compile("([0-9]{4})\\-([0-9]{2})\\-([0-9]{2})");

    Matcher m = p.matcher(srcpath.getName());

    if (m.find()) {
        // suffix such as "yyyy-MM-dd.txt" at the end of the file name
        date_ext = srcpath.getName().substring(m.start()/*, m.end()*/);
    }

    Configuration conf = new Configuration();

    FileSystem hdfs = FileSystem.get(conf);

    // get matched file list
    final String suffix = date_ext;
    PathFilter resultFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().endsWith(suffix);
        }
    };

    try {
        FileStatus[] status = hdfs.listStatus(parentPath, resultFileFilter);

        if (status != null) {
            Path[] listedPaths = FileUtil.stat2Paths(status);

            if (listedPaths != null) {
                count = listedPaths.length;
            }
        }
    } catch (FileNotFoundException ignore) {
    } catch (InvalidInputException ignore) {
        // Hadoop 1.x throws InvalidInputException here instead of FileNotFoundException
    }

    return count;
}

From source file:ph.fingra.hadoop.mapred.common.HdfsFileUtil.java

License:Apache License

public static boolean deleteNBackupFile(String srcdir, String srcfile, int maxcount, String runday,
        final String dbfnameprefix) throws IOException {

    Configuration conf = new Configuration();

    FileSystem hdfs = FileSystem.get(conf);

    Path targetPath = null;
    Path rootPath = new Path(srcdir);
    Path sourcePath = new Path(srcfile);
    String target_day = "";
    String target_file = "";
    boolean success = false;

    // if the source file does not exist, skip the backup and return true
    if (!hdfs.exists(sourcePath)) {
        return true;
    }

    // build the backup file name from yesterday's date
    target_day = DateTimeUtil.addDays(runday, -1, "yyyyMMdd");
    target_file = srcfile + "-" + target_day;
    //System.out.println("target_file - " + target_file);
    targetPath = new Path(target_file);

    // delete any existing backup file with the same name, then rename the source file to it
    if (hdfs.exists(new Path(target_file))) {
        hdfs.delete(targetPath, true);
    }
    success = hdfs.rename(sourcePath, targetPath);

    // get backup file list
    PathFilter resultFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().startsWith(dbfnameprefix + "-");
        }
    };

    try {
        FileStatus[] status = hdfs.listStatus(rootPath, resultFileFilter);

        Path[] listedPaths = FileUtil.stat2Paths(status);

        // delete more than maximum number of backup files
        if (listedPaths.length > maxcount) {

            Comparator<Path> c = new Comparator<Path>() {
                public int compare(Path o1, Path o2) {
                    int ret = 0;
                    ret = o1.getName().compareTo(o2.getName());
                    return -(ret); // reverse order, so newer dates come first
                }
            };

            Arrays.sort(listedPaths, c);

            for (int i = maxcount; i < listedPaths.length; i++) {
                Path path = listedPaths[i];
                hdfs.delete(path, true);
            }
        }
    } catch (FileNotFoundException ignore) {
    } catch (InvalidInputException ignore) {
        // Hadoop 1.x throws InvalidInputException here instead of FileNotFoundException
    }

    return success;
}

From source file:ph.fingra.hadoop.mapred.common.HdfsFileUtil.java

License:Apache License

public static boolean deleteOriginFiles(FingraphConfig config, String year, String month, String day)
        throws IOException {

    Configuration conf = new Configuration();

    FileSystem hdfs = FileSystem.get(conf);

    String root_uri = config.getHadoop_user_path() + (config.getHadoop_user_path().endsWith("/") ? "" : "/")
            + config.getSetting().getHfs_input_path()
            + (config.getSetting().getHfs_input_path().endsWith("/") ? "" : "/");
    root_uri = root_uri.replaceAll("\\{yyyy\\}", year);
    root_uri = root_uri.replaceAll("\\{MM\\}", month);
    root_uri = root_uri.replaceAll("\\{dd\\}", day);
    String file_uri = config.getSetting().getOrigin_input_file();
    file_uri = file_uri.replaceAll("\\{yyyy\\}", year);
    file_uri = file_uri.replaceAll("\\{MM\\}", month);
    file_uri = file_uri.replaceAll("\\{dd\\}", day);
    file_uri = file_uri.replace("*", "[\\w]*");
    final String patt = "^" + file_uri + "$";
    //System.out.println(patt);

    Path rootPath = new Path(root_uri);
    boolean success = false;

    // get matched file list
    PathFilter resultFileFilter = new PathFilter() {
        @Override
        public boolean accept(Path path) {
            return path.getName().matches(patt);
        }
    };

    try {
        FileStatus[] status = hdfs.listStatus(rootPath, resultFileFilter);

        if (status != null) {
            Path[] listedPaths = FileUtil.stat2Paths(status);

            if (listedPaths != null) {
                for (Path path : listedPaths) {
                    success = hdfs.delete(path, true);
                }
            }
        }
    } catch (FileNotFoundException ignore) {
    } catch (InvalidInputException ignore) {
        // Hadoop 1.x throws InvalidInputException here instead of FileNotFoundException
    }

    return success;
}

From source file:si.david.mapreduce.lda.InternalVectorDumper.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    /*
     Option seqOpt = obuilder.withLongName("seqFile").withRequired(false).withArgument(
     abuilder.withName("seqFile").withMinimum(1).withMaximum(1).create()).withDescription(
     "The Sequence File containing the Vectors").withShortName("s").create();
     Option dirOpt = obuilder.withLongName("seqDirectory").withRequired(false).withArgument(
     abuilder.withName("seqDirectory").withMinimum(1).withMaximum(1).create())
     .withDescription("The directory containing Sequence File of Vectors")
     .withShortName("d").create();
     */
    addInputOption();
    addOutputOption();
    addOption("useKey", "u", "If the Key is a vector than dump that instead");
    addOption("printKey", "p", "Print out the key as well, delimited by tab (or the value if useKey is true");
    addOption("dictionary", "d", "The dictionary file.", false);
    addOption("dictionaryType", "dt", "The dictionary file type (text|seqfile)", false);
    addOption("csv", "c",
            "Output the Vector as CSV.  Otherwise it substitutes in the terms for vector cell entries");
    addOption("namesAsComments", "n", "If using CSV output, optionally add a comment line for each NamedVector "
            + "(if the vector is one) printing out the name");
    addOption("nameOnly", "N", "Use the name as the value for each NamedVector (skip other vectors)");
    addOption("sortVectors", "sort",
            "Sort output key/value pairs of the vector entries in abs magnitude " + "descending order");
    addOption("quiet", "q", "Print only file contents");
    addOption("sizeOnly", "sz", "Dump only the size of the vector");
    addOption("numItems", "ni", "Output at most <n> vecors", false);
    addOption("vectorSize", "vs",
            "Truncate vectors to <vs> length when dumping (most useful when in" + " conjunction with -sort",
            false);
    addOption(buildOption("filter", "fi",
            "Only dump out those vectors whose name matches the filter."
                    + "  Multiple items may be specified by repeating the argument.",
            true, 1, Integer.MAX_VALUE, false, null));

    if (parseArguments(args, false, true) == null) {
        return -1;
    }

    Path[] pathArr;
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    Path input = getInputPath();
    FileStatus fileStatus = fs.getFileStatus(input);
    if (fileStatus.isDir()) {
        pathArr = FileUtil.stat2Paths(fs.listStatus(input, PathFilters.logsCRCFilter()));
    } else {
        FileStatus[] inputPaths = fs.globStatus(input);
        pathArr = new Path[inputPaths.length];
        int i = 0;
        for (FileStatus fstatus : inputPaths) {
            pathArr[i++] = fstatus.getPath();
        }
    }

    String dictionaryType = getOption("dictionaryType", "text");

    boolean sortVectors = hasOption("sortVectors");
    boolean quiet = hasOption("quiet");
    if (!quiet) {
        log.info("Sort? {}", sortVectors);
    }

    String[] dictionary = null;
    if (hasOption("dictionary")) {
        String dictFile = getOption("dictionary");
        switch (dictionaryType) {
        case "text":
            dictionary = VectorHelper.loadTermDictionary(new File(dictFile));
            break;
        case "sequencefile":
            dictionary = VectorHelper.loadTermDictionary(conf, dictFile);
            break;
        default:
            //TODO: support Lucene's FST as a dictionary type
            throw new IOException("Invalid dictionary type: " + dictionaryType);
        }
    }

    Set<String> filters;
    if (hasOption("filter")) {
        filters = Sets.newHashSet(getOptions("filter"));
    } else {
        filters = null;
    }

    boolean useCSV = hasOption("csv");

    boolean sizeOnly = hasOption("sizeOnly");
    boolean nameOnly = hasOption("nameOnly");
    boolean namesAsComments = hasOption("namesAsComments");
    boolean transposeKeyValue = hasOption("vectorAsKey");
    Writer writer;
    boolean shouldClose;
    File output = getOutputFile();
    if (output != null) {
        shouldClose = true;
        log.info("Output file: {}", output);
        Files.createParentDirs(output);
        writer = Files.newWriter(output, Charsets.UTF_8);
    } else {
        shouldClose = false;
        writer = new OutputStreamWriter(System.out, Charsets.UTF_8);
    }
    try {
        boolean printKey = hasOption("printKey");
        if (useCSV && dictionary != null) {
            writer.write("#");
            for (int j = 0; j < dictionary.length; j++) {
                writer.write(dictionary[j]);
                if (j < dictionary.length - 1) {
                    writer.write(',');
                }
            }
            writer.write('\n');
        }
        Long numItems = null;
        if (hasOption("numItems")) {
            numItems = Long.parseLong(getOption("numItems"));
            if (quiet) {
                writer.append("#Max Items to dump: ").append(String.valueOf(numItems)).append('\n');
            }
        }
        int maxIndexesPerVector = hasOption("vectorSize") ? Integer.parseInt(getOption("vectorSize"))
                : Integer.MAX_VALUE;
        long itemCount = 0;
        int fileCount = 0;
        for (Path path : pathArr) {
            if (numItems != null && numItems <= itemCount) {
                break;
            }
            if (quiet) {
                log.info("Processing file '{}' ({}/{})", path, ++fileCount, pathArr.length);
            }
            SequenceFileIterable<Writable, Writable> iterable = new SequenceFileIterable<>(path, true, conf);
            Iterator<Pair<Writable, Writable>> iterator = iterable.iterator();
            long i = 0;
            while (iterator.hasNext() && (numItems == null || itemCount < numItems)) {
                Pair<Writable, Writable> record = iterator.next();
                Writable keyWritable = record.getFirst();
                Writable valueWritable = record.getSecond();
                if (printKey) {
                    Writable notTheVectorWritable = transposeKeyValue ? valueWritable : keyWritable;
                    writer.write(notTheVectorWritable.toString());
                    writer.write('\t');
                }
                Vector vector;
                try {
                    vector = ((VectorWritable) (transposeKeyValue ? keyWritable : valueWritable)).get();
                } catch (ClassCastException e) {
                    if ((transposeKeyValue ? keyWritable
                            : valueWritable) instanceof WeightedPropertyVectorWritable) {
                        vector = ((WeightedPropertyVectorWritable) (transposeKeyValue ? keyWritable
                                : valueWritable)).getVector();
                    } else {
                        throw e;
                    }
                }
                if (filters == null || !(vector instanceof NamedVector)
                        || filters.contains(((NamedVector) vector).getName())) {
                    if (sizeOnly) {
                        if (vector instanceof NamedVector) {
                            writer.write(((NamedVector) vector).getName());
                            writer.write(":");
                        } else {
                            writer.write(String.valueOf(i++));
                            writer.write(":");
                        }
                        writer.write(String.valueOf(vector.size()));
                        writer.write('\n');
                    } else if (nameOnly) {
                        if (vector instanceof NamedVector) {
                            writer.write(((NamedVector) vector).getName());
                            writer.write('\n');
                        }
                    } else {
                        String fmtStr;
                        if (useCSV) {
                            fmtStr = VectorHelper.vectorToCSVString(vector, namesAsComments);
                        } else {
                            fmtStr = VectorHelper.vectorToJson(vector, dictionary, maxIndexesPerVector,
                                    sortVectors);
                        }
                        writer.write(fmtStr);
                        writer.write('\n');
                    }
                    itemCount++;
                }
            }
        }
        writer.flush();
    } finally {
        if (shouldClose) {
            Closeables.close(writer, false);
        }
    }

    return 0;
}

From source file:StorageEngineClient.CombineColumnStorageFileInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;

    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = job.getLong("mapred.max.split.size", 0);
    }
    if (maxSize == 0) {
        maxSize = (long) (job.getLong("dfs.block.size", 512 * 1024 * 1024) * 0.8);
    }
    if (minSizeNode == 0) {
        minSizeNode = maxSize / 2;
    }
    if (minSizeRack == 0) {
        minSizeRack = maxSize / 2;
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    Path[] paths = FileUtil.stat2Paths(listStatus(job));
    List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
    if (paths.length == 0) {
        return splits.toArray(new CombineFileSplit[splits.size()]);
    }

    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        for (int i = 0; i < paths.length; i++) {
            if (paths[i] == null) {
                continue;
            }
            Path p = new Path(paths[i].toUri().getPath());
            if (onepool.accept(p)) {
                myPaths.add(paths[i]);
                paths[i] = null;
            }
        }
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    ArrayList<Path> myPaths = new ArrayList<Path>();
    for (int i = 0; i < paths.length; i++) {
        if (paths[i] == null) {
            continue;
        }
        myPaths.add(paths[i]);
    }
    LOG.info("myPaths size:\t" + myPaths.size());
    getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    if (splits.size() == 0)
        return super.getSplits(job, numSplits);
    LOG.info("splits #:\t" + splits.size());
    return splits.toArray(new CombineFileSplit[splits.size()]);
}

From source file:StorageEngineClient.CombineFileInputFormat.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;

    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = job.getLong("mapred.max.split.size", 0);
    }
    if (maxSize == 0) {
        maxSize = (long) (job.getLong("dfs.block.size", 512 * 1024 * 1024) * 0.8);
    }
    if (minSizeNode == 0) {
        minSizeNode = maxSize / 2;
    }
    if (minSizeRack == 0) {
        minSizeRack = maxSize / 2;
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    FileStatus[] fsStatus = listStatus(job);
    Path[] paths = FileUtil.stat2Paths(fsStatus);
    Map<String, FileStatus> fileNameToStatus = new HashMap<String, FileStatus>();
    int arraySize = fsStatus.length;
    for (int i = 0; i < arraySize; i++) {
        fileNameToStatus.put(getFileName(paths[i]), fsStatus[i]);
    }

    List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
    if (paths.length == 0) {
        return splits.toArray(new CombineFileSplit[splits.size()]);
    }

    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        for (int i = 0; i < paths.length; i++) {
            if (paths[i] == null) {
                continue;
            }
            Path p = new Path(paths[i].toUri().getPath());
            if (onepool.accept(p)) {
                myPaths.add(paths[i]);
                paths[i] = null;
            }
        }
        getMoreSplitsWithStatus(job, myPaths.toArray(new Path[myPaths.size()]), fileNameToStatus, maxSize,
                minSizeNode, minSizeRack, splits);
    }

    ArrayList<Path> myPaths = new ArrayList<Path>();
    for (int i = 0; i < paths.length; i++) {
        if (paths[i] == null) {
            continue;
        }
        myPaths.add(paths[i]);
    }
    LOG.debug("myPaths size:\t" + myPaths.size());
    try {
        getMoreSplitsWithStatus(job, myPaths.toArray(new Path[myPaths.size()]), fileNameToStatus, maxSize,
                minSizeNode, minSizeRack, splits);
    } catch (NullGzFileException e) {
        throw new IOException(e);
    }
    LOG.debug("splits #:\t" + splits.size());

    return splits.toArray(new CombineFileSplit[splits.size()]);
}

From source file:StorageEngineClient.CombineFileInputFormat_bak.java

License:Open Source License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {

    long minSizeNode = 0;
    long minSizeRack = 0;
    long maxSize = 0;

    if (minSplitSizeNode != 0) {
        minSizeNode = minSplitSizeNode;
    } else {
        minSizeNode = job.getLong("mapred.min.split.size.per.node", 0);
    }
    if (minSplitSizeRack != 0) {
        minSizeRack = minSplitSizeRack;
    } else {
        minSizeRack = job.getLong("mapred.min.split.size.per.rack", 0);
    }
    if (maxSplitSize != 0) {
        maxSize = maxSplitSize;
    } else {
        maxSize = job.getLong("mapred.max.split.size", 0);
    }
    if (maxSize == 0) {
        maxSize = (long) (job.getLong("dfs.block.size", 512 * 1024 * 1024) * 0.8);
    }
    if (minSizeNode == 0) {
        minSizeNode = maxSize / 2;
    }
    if (minSizeRack == 0) {
        minSizeRack = maxSize / 2;
    }
    if (minSizeNode != 0 && maxSize != 0 && minSizeNode > maxSize) {
        throw new IOException("Minimum split size pernode " + minSizeNode
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && maxSize != 0 && minSizeRack > maxSize) {
        throw new IOException("Minimum split size per rack" + minSizeRack
                + " cannot be larger than maximum split size " + maxSize);
    }
    if (minSizeRack != 0 && minSizeNode > minSizeRack) {
        throw new IOException("Minimum split size per node" + minSizeNode
                + " cannot be smaller than minimum split size per rack " + minSizeRack);
    }

    Path[] paths = FileUtil.stat2Paths(listStatus(job));
    List<CombineFileSplit> splits = new ArrayList<CombineFileSplit>();
    if (paths.length == 0) {
        return splits.toArray(new CombineFileSplit[splits.size()]);
    }

    for (MultiPathFilter onepool : pools) {
        ArrayList<Path> myPaths = new ArrayList<Path>();

        for (int i = 0; i < paths.length; i++) {
            if (paths[i] == null) {
                continue;
            }
            Path p = new Path(paths[i].toUri().getPath());
            if (onepool.accept(p)) {
                myPaths.add(paths[i]);
                paths[i] = null;
            }
        }
        getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack,
                splits);
    }

    ArrayList<Path> myPaths = new ArrayList<Path>();
    for (int i = 0; i < paths.length; i++) {
        if (paths[i] == null) {
            continue;
        }
        myPaths.add(paths[i]);
    }
    LOG.info("myPaths size:\t" + myPaths.size());
    getMoreSplits(job, myPaths.toArray(new Path[myPaths.size()]), maxSize, minSizeNode, minSizeRack, splits);
    LOG.info("splits #:\t" + splits.size());

    return splits.toArray(new CombineFileSplit[splits.size()]);
}

From source file:translator.MRTranslate1.java

License:Open Source License

public int run(String[] args) throws Exception {
    Job job = new Job(getConf(), "Translate1");
    job.setJarByClass(MRTranslate1.class);
    job.setMapperClass(MRTranslateMapper1.class);
    job.setReducerClass(MRTranslateReducer1.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    //job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    //job.setMapOutputValueClass(Text.class);

    //job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    //FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    TableMapReduceUtil.newJob();
    MyFileInputFormat.addInputPath(job, new Path(args[0]));
    byte[] startRow = new byte[1];
    byte[] stopRow = new byte[1];
    startRow[0] = (byte) 1;
    stopRow[0] = (byte) 2;
    TableMapReduceUtil.addCol("", "T", "H2RDF", startRow, stopRow, "A:", job);
    job.setInputFormatClass(FileTableInputFormat.class);

    FileSystem fs = FileSystem.get(conf);

    Path inputDir = new Path(args[0]);
    System.out.println(args[0]);
    int reducer_num = FileUtil.stat2Paths(fs.listStatus(inputDir)).length;

    job.getConfiguration().set("nikos.inputfile", "translate/trans_hash_" + JoinPlaner.id);

    //job.getConfiguration().setInt("mapred.map.tasks", 18);
    job.getConfiguration().setInt("mapred.reduce.tasks", reducer_num);
    job.getConfiguration().setBoolean("mapred.map.tasks.speculative.execution", false);
    job.getConfiguration().setBoolean("mapred.reduce.tasks.speculative.execution", false);
    job.getConfiguration().setInt("io.sort.mb", 100);
    job.getConfiguration().setInt("io.file.buffer.size", 131072);
    job.getConfiguration().setInt("mapred.job.reuse.jvm.num.tasks", -1);

    job.waitForCompletion(true);
    return 0;
}

From source file:tv.icntv.log.stb.commons.HadoopUtils.java

License:Apache License

public static Path[] createFile(Path from, Path to, PathFilter filter, String fromSuffix, String toSuffix,
        String parsed) throws IOException {
    FileSystem fileSystem = null;
    try {
        fileSystem = FileSystem.get(configuration);
        Path[] paths = FileUtil.stat2Paths(fileSystem.listStatus(from, filter));
        List<Path> inputs = Lists.newArrayList();
        for (Path path : paths) {
            // file name without the fromSuffix
            String name = path.getName().replace(fromSuffix, "");

            if (isExist(new Path(to, name.concat(parsed)))) {
                continue;
            }

            if (createFile(new Path(to, name.concat(toSuffix)))) {
                inputs.add(new Path(from, name));
            }
        }
        return inputs.toArray(new Path[inputs.size()]);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    } finally {
        if (null != fileSystem) {
            fileSystem.close();
        }
    }
}