List of usage examples for the org.apache.hadoop.fs.Path constructor Path
public Path(URI aUri)
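Before the collected sources below, here is a minimal standalone sketch (not taken from any of the listed files) of constructing a Path directly from a URI; the HDFS address and file location are hypothetical placeholders:

import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathFromUriExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical HDFS location; replace with a URI that exists in your cluster.
        URI uri = new URI("hdfs://namenode:8020/user/demo/input/data.txt");

        // Path(URI) preserves the scheme and authority carried by the URI.
        Path path = new Path(uri);

        // Resolve the FileSystem that owns this path and check for the file.
        Configuration conf = new Configuration();
        FileSystem fs = path.getFileSystem(conf);
        System.out.println("exists: " + fs.exists(path));
    }
}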
From source file:WordCount_SiCombiner.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount_SiCombiner.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    // disable combiner
    // job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(WordPartition.class);
    job.setNumReduceTasks(5);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:BooleanRetrievalCompressed.java
License:Apache License
private void initialize(String indexPath, String collectionPath, FileSystem fs) throws IOException {
    index = new MapFile.Reader(new Path(indexPath + "/part-r-00000"), fs.getConf());
    collection = fs.open(new Path(collectionPath));
    stack = new Stack<Set<Integer>>();
}
From source file:HdfsReader.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("HdfsReader [FileSize i.e. 1g/10g/100g/200g]");
        return 1;
    }
    double fileSize;
    double fileSizeInMB;
    if (args[0].equals("1g")) {
        fileSize = 1073741824.0;
        fileSizeInMB = 1024.0;
    } else if (args[0].equals("10g")) {
        fileSize = 10737418240.0;
        fileSizeInMB = 10240.0;
    } else if (args[0].equals("100g")) {
        fileSize = 107374182400.0;
        fileSizeInMB = 102400.0;
    } else if (args[0].equals("200g")) {
        fileSize = 214748364800.0;
        fileSizeInMB = 204800.0;
    } else {
        throw new IllegalArgumentException("Invalid arg: " + args[0]);
    }

    String fileName = "read-" + args[0] + "-avg.txt";
    File avgFile = new File(fileName);
    PrintWriter avgPW = new PrintWriter(avgFile);
    fileName = "read-" + args[0] + "-min.txt";
    File minFile = new File(fileName);
    PrintWriter minPW = new PrintWriter(minFile);
    fileName = "read-" + args[0] + "-max.txt";
    File maxFile = new File(fileName);
    PrintWriter maxPW = new PrintWriter(maxFile);

    int numIters = 10;
    int bufferSize = 4096;
    long blockSize[] = new long[] { 67108864, 134217728, 268435456, 536870912, 1073741824 };
    short replication[] = new short[] { 1, 4 };
    String hdfsFile = "/hdfs_test/" + args[0] + "/1.in";

    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);

    for (int i = 0; i < 5; i++) { // blockSize
        for (int j = 0; j < 2; j++) { // replication
            OutputStream os = fs.create(hdfsFilePath, true, bufferSize, replication[j], blockSize[i]);
            byte[] buf = new byte[bufferSize];
            for (int m = 0; m < bufferSize; m += 4) {
                buf[m] = (byte) m;
            }
            double numBufPerFile = fileSize / (double) bufferSize;
            for (double m = 0.0; m < numBufPerFile; m++) {
                os.write(buf);
            }
            os.close();

            long avg = 0, min = Long.MAX_VALUE, max = Long.MIN_VALUE;
            for (int k = 0; k < numIters; k++) {
                InputStream is = fs.open(hdfsFilePath);
                long startTime = System.currentTimeMillis();
                int bytesRead = is.read(buf);
                while (bytesRead != -1) {
                    bytesRead = is.read(buf);
                }
                is.close();
                long endTime = System.currentTimeMillis();
                long duration = (endTime - startTime);
                avg += duration;
                if (duration < min) {
                    min = duration;
                }
                if (duration > max) {
                    max = duration;
                }
            }

            // write result to output
            double avgBW = fileSizeInMB * 1000.0 * (double) numIters / (double) avg;
            avgPW.print(avgBW);
            avgPW.print("\t");
            double minBW = fileSizeInMB * 1000.0 / (double) max;
            minPW.print(minBW);
            minPW.print("\t");
            double maxBW = fileSizeInMB * 1000.0 / (double) min;
            maxPW.print(maxBW);
            maxPW.print("\t");
        }
        avgPW.println();
        minPW.println();
        maxPW.println();
    }
    avgPW.close();
    minPW.close();
    maxPW.close();
    return 0;
}
From source file:GetRetweetersAndCountPerUser.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 3) {
        System.err.println("Usage: GetRetweetersAndCountPerUser <in> <out> <num_reducers>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(RetweetersPerUser.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    System.out.println(otherArgs[0]);
    job.setMapperClass(TweetMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(Integer.parseInt(args[2]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    if (job.waitForCompletion(true)) {
        FileSystem hdfs = FileSystem.get(new URI(args[1]), conf);
        Path dir = new Path(args[1]);
        PathFilter filter = new PathFilter() {
            public boolean accept(Path file) {
                return file.getName().startsWith("part-r-");
            }
        };
        HashMap<Integer, Integer> counts_for_user = new HashMap<Integer, Integer>();
        FileStatus[] files = hdfs.listStatus(dir, filter);
        Arrays.sort(files);
        for (int i = 0; i != files.length; i++) {
            Path pt = files[i].getPath();
            BufferedReader br = new BufferedReader(new InputStreamReader(hdfs.open(pt)));
            String line = null;
            while ((line = br.readLine()) != null) {
                String[] columns = new String[2];
                columns = line.split("\t");
                int key = Integer.parseInt(columns[0]);
                if (counts_for_user.containsKey(key))
                    counts_for_user.put(key, counts_for_user.get(key) + 1);
                else
                    counts_for_user.put(key, 1);
            }
            br.close();
        }
        FSDataOutputStream fsDataOutputStream = hdfs.create(new Path(otherArgs[1] + "_count"));
        PrintWriter writer = new PrintWriter(fsDataOutputStream);
        for (Entry<Integer, Integer> e : counts_for_user.entrySet()) {
            writer.write(e.getKey() + "\t" + e.getValue() + "\n");
        }
        writer.close();
        fsDataOutputStream.close();
        hdfs.close();
        System.exit(0);
    }
    System.exit(1);
}
From source file:WordCount_PerMapTally.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    GenericOptionsParser parser = new GenericOptionsParser(conf, args);
    String[] otherArgs = parser.getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount_PerMapTally.class);
    job.setMapperClass(TokenizerMapper.class);
    // disable combiner
    // job.setCombinerClass(IntSumReducer.class);
    job.setPartitionerClass(WordPartitioner.class);
    job.setNumReduceTasks(5);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:SiCombiner.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(SiCombiner.class);
    job.setMapperClass(TokenizerMapper.class);
    // Aniket changes starts
    /* Here the partitioner is being called */
    job.setPartitionerClass(WordPartitioner.class);
    // Aniket changes ends
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:AnalyzeBigramRelativeFrequencyTuple.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length != 1) {
        System.out.println("usage: [input-path]");
        System.exit(-1);
    }
    System.out.println("input path: " + args[0]);

    List<PairOfWritables<Tuple, FloatWritable>> pairs = SequenceFileUtils.readDirectory(new Path(args[0]));

    List<PairOfWritables<Tuple, FloatWritable>> list1 = Lists.newArrayList();
    List<PairOfWritables<Tuple, FloatWritable>> list2 = Lists.newArrayList();

    for (PairOfWritables<Tuple, FloatWritable> p : pairs) {
        Tuple bigram = p.getLeftElement();
        if (bigram.get(0).equals("light")) {
            list1.add(p);
        }
        if (bigram.get(0).equals("contain")) {
            list2.add(p);
        }
    }

    Collections.sort(list1, new Comparator<PairOfWritables<Tuple, FloatWritable>>() {
        @SuppressWarnings("unchecked")
        public int compare(PairOfWritables<Tuple, FloatWritable> e1, PairOfWritables<Tuple, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });

    int i = 0;
    for (PairOfWritables<Tuple, FloatWritable> p : list1) {
        Tuple bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
        i++;
        if (i > 10) {
            break;
        }
    }

    Collections.sort(list2, new Comparator<PairOfWritables<Tuple, FloatWritable>>() {
        @SuppressWarnings("unchecked")
        public int compare(PairOfWritables<Tuple, FloatWritable> e1, PairOfWritables<Tuple, FloatWritable> e2) {
            if (e1.getRightElement().compareTo(e2.getRightElement()) == 0) {
                return e1.getLeftElement().compareTo(e2.getLeftElement());
            }
            return e2.getRightElement().compareTo(e1.getRightElement());
        }
    });

    i = 0;
    for (PairOfWritables<Tuple, FloatWritable> p : list2) {
        Tuple bigram = p.getLeftElement();
        System.out.println(bigram + "\t" + p.getRightElement());
        i++;
        if (i > 10) {
            break;
        }
    }
}
From source file:PerMapTally.java
License:Apache License
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: wordcount <in> <out>");
        System.exit(2);
    }
    Job job = new Job(conf, "word count");
    job.setJarByClass(PerMapTally.class);
    job.setMapperClass(TokenizerMapper.class);
    // Aniket changes starts
    /* Here the partitioner is being called */
    job.setPartitionerClass(WordPartitioner.class);
    // Aniket changes ends
    // Part 3 Aniket changes starts
    /* Here I am just disabling the combiner */
    // job.setCombinerClass(IntSumReducer.class);
    // Part 3 Aniket changes ends
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
From source file:ImportTsv.java
License:Apache License
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args)
        throws IOException, ClassNotFoundException {
    Job job = null;
    try (Connection connection = ConnectionFactory.createConnection(conf)) {
        try (Admin admin = connection.getAdmin()) {
            // Support non-XML supported characters
            // by re-encoding the passed separator as a Base64 string.
            String actualSeparator = conf.get(SEPARATOR_CONF_KEY);
            if (actualSeparator != null) {
                conf.set(SEPARATOR_CONF_KEY, Base64.encodeBytes(actualSeparator.getBytes()));
            }

            // See if a non-default Mapper was set
            String mapperClassName = conf.get(MAPPER_CONF_KEY);
            Class mapperClass = mapperClassName != null ? Class.forName(mapperClassName) : DEFAULT_MAPPER;

            TableName tableName = TableName.valueOf(args[0]);
            Path inputDir = new Path(args[1]);

            // set filter
            conf.set(EASTCOM_FILTER_PARAMS, args[3]);
            conf.set(EASTCOM_FILTER_DEFINE, args[4]);

            String jobName = conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName.getNameAsString());
            job = Job.getInstance(conf, jobName);
            job.setJarByClass(mapperClass);
            FileInputFormat.setInputPaths(job, inputDir);
            job.setInputFormatClass(TextInputFormat.class);
            job.setMapperClass(mapperClass);
            String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);
            String columns[] = conf.getStrings(COLUMNS_CONF_KEY);
            if (StringUtils.isNotEmpty(conf.get(CREDENTIALS_LOCATION))) {
                String fileLoc = conf.get(CREDENTIALS_LOCATION);
                Credentials cred = Credentials.readTokenStorageFile(new File(fileLoc), conf);
                job.getCredentials().addAll(cred);
            }

            if (hfileOutPath != null) {
                if (!admin.tableExists(tableName)) {
                    String errorMsg = format("Table '%s' does not exist.", tableName);
                    if ("yes".equalsIgnoreCase(conf.get(CREATE_TABLE_CONF_KEY, "yes"))) {
                        LOG.warn(errorMsg);
                        // TODO: this is backwards. Instead of depending on the existence of a table,
                        // create a sane splits file for HFileOutputFormat based on data sampling.
                        createTable(admin, tableName, columns);
                    } else {
                        LOG.error(errorMsg);
                        throw new TableNotFoundException(errorMsg);
                    }
                }
                try (HTable table = (HTable) connection.getTable(tableName)) {
                    boolean noStrict = conf.getBoolean(NO_STRICT_COL_FAMILY, false);
                    // if no.strict is false then check column family
                    if (!noStrict) {
                        ArrayList<String> unmatchedFamilies = new ArrayList<String>();
                        Set<String> cfSet = getColumnFamilies(columns);
                        HTableDescriptor tDesc = table.getTableDescriptor();
                        for (String cf : cfSet) {
                            if (tDesc.getFamily(Bytes.toBytes(cf)) == null) {
                                unmatchedFamilies.add(cf);
                            }
                        }
                        if (unmatchedFamilies.size() > 0) {
                            ArrayList<String> familyNames = new ArrayList<String>();
                            for (HColumnDescriptor family : table.getTableDescriptor().getFamilies()) {
                                familyNames.add(family.getNameAsString());
                            }
                            String msg = "Column Families " + unmatchedFamilies + " specified in "
                                    + COLUMNS_CONF_KEY + " does not match with any of the table " + tableName
                                    + " column families " + familyNames + ".\n"
                                    + "To disable column family check, use -D" + NO_STRICT_COL_FAMILY + "=true.\n";
                            usage(msg);
                            System.exit(-1);
                        }
                    }
                    job.setReducerClass(PutSortReducer.class);
                    Path outputDir = new Path(hfileOutPath);
                    FileOutputFormat.setOutputPath(job, outputDir);
                    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
                    if (mapperClass.equals(TsvImporterTextMapper.class)) {
                        job.setMapOutputValueClass(Text.class);
                        job.setReducerClass(TextSortReducer.class);
                    } else {
                        job.setMapOutputValueClass(Put.class);
                        job.setCombinerClass(PutCombiner.class);
                    }
                    HFileOutputFormat2.configureIncrementalLoad(job, table, table);
                }
            } else {
                if (!admin.tableExists(tableName)) {
                    String errorMsg = format("Table '%s' does not exist.", tableName);
                    LOG.error(errorMsg);
                    throw new TableNotFoundException(errorMsg);
                }
                if (mapperClass.equals(TsvImporterTextMapper.class)) {
                    usage(TsvImporterTextMapper.class.toString()
                            + " should not be used for non bulkloading case. use "
                            + TsvImporterMapper.class.toString()
                            + " or custom mapper whose value type is Put.");
                    System.exit(-1);
                }
                // No reducers. Just write straight to table. Call initTableReducerJob
                // to set up the TableOutputFormat.
                TableMapReduceUtil.initTableReducerJob(tableName.getNameAsString(), null, job);
                job.setNumReduceTasks(0);
            }

            TableMapReduceUtil.addDependencyJars(job);
            TableMapReduceUtil.addDependencyJars(job.getConfiguration(),
                    com.google.common.base.Function.class /* Guava used by TsvParser */);
        }
    }
    return job;
}
From source file:Relevance.java
License:Apache License
/**
 * Exact relevance is slower, non-exact relevance will have false positives.
 */
protected void batch_query(Tap source, Tap output, Fields wantedFields, RelevanceFunction func, Tap keysTap,
        String keyField, boolean useBloom, int bloom_bits, int bloom_hashes, boolean exact) throws IOException {
    if (!useBloom && !exact)
        throw new IllegalArgumentException("Must either use bloom filter or be exact, or both!");

    FileSystem fs = FileSystem.get(new Configuration());
    Pipe finalPipe = new Pipe("data");
    finalPipe = new Each(finalPipe, wantedFields, new Identity());

    Map<String, Tap> sources = new HashMap<String, Tap>();
    sources.put("data", source);

    Map properties = new HashMap();
    String bloomFilterPath = "/tmp/" + UUID.randomUUID().toString() + ".bloomfilter";

    if (useBloom) {
        String jobId = UUID.randomUUID().toString();
        LOG.info("Creating bloom filter");
        writeOutBloomFilter(keysTap, keyField, fs, bloomFilterPath, bloom_bits, bloom_hashes);
        properties.put("mapred.job.reuse.jvm.num.tasks", -1);
        if (!TEST_MODE) {
            properties.put("mapred.cache.files", "hdfs://" + bloomFilterPath);
        } else {
            properties.put("batch_query.relevance.file", bloomFilterPath);
        }
        LOG.info("Done creating bloom filter");
        finalPipe = new Each(finalPipe, wantedFields, getRelevanceFilter(func, jobId));
    }

    if (exact) {
        sources.put("relevant", keysTap);
        Pipe relevantRecords = new Pipe("relevant");
        relevantRecords = new Each(relevantRecords, new Fields(keyField), new Identity());
        finalPipe = new Each(finalPipe, wantedFields, getExactFilter(func),
                Fields.join(wantedFields, new Fields(ID, RELEVANT_OBJECT)));
        finalPipe = new CoGroup(finalPipe, new Fields(RELEVANT_OBJECT), relevantRecords, new Fields(keyField),
                Fields.join(wantedFields, new Fields(ID, RELEVANT_OBJECT), new Fields("__ignored")));
        finalPipe = new Each(finalPipe, Fields.join(wantedFields, new Fields(ID)), new Identity());
        if (func.canHaveMultipleMatches()) {
            finalPipe = new Distinct(finalPipe, new Fields(ID));
        }
        finalPipe = new Each(finalPipe, wantedFields, new Identity());
    }

    Flow flow = new FlowConnector(properties).connect("Relevance: " + func.getClass().getSimpleName(), sources,
            output, finalPipe);
    flow.complete();

    if (useBloom)
        fs.delete(new Path(bloomFilterPath), false);
}