Example usage for org.apache.hadoop.mapred LineRecordReader createKey

List of usage examples for org.apache.hadoop.mapred LineRecordReader createKey

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred LineRecordReader createKey.

Prototype

public LongWritable createKey() 

Source Link

Usage

From source file:edu.umn.cs.spatialHadoop.mapred.SpatialInputFormat.java

License:Open Source License

protected void listStatus(final FileSystem fs, Path dir, final List<FileStatus> result, BlockFilter filter)
        throws IOException {
    GlobalIndex<Partition> gindex = SpatialSite.getGlobalIndex(fs, dir);
    if (gindex == null) {
        FileStatus[] listStatus;//w w  w .  j  a  va  2s.co m
        if (OperationsParams.isWildcard(dir)) {
            // Wild card
            listStatus = fs.globStatus(dir);
        } else {
            listStatus = fs.listStatus(dir, SpatialSite.NonHiddenFileFilter);
        }
        // Add all files under this directory
        for (FileStatus status : listStatus) {
            if (status.isDir()) {
                listStatus(fs, status.getPath(), result, filter);
            } else if (status.getPath().getName().toLowerCase().endsWith(".list")) {
                LineRecordReader in = new LineRecordReader(fs.open(status.getPath()), 0, status.getLen(),
                        Integer.MAX_VALUE);
                LongWritable key = in.createKey();
                Text value = in.createValue();
                while (in.next(key, value)) {
                    result.add(fs.getFileStatus(new Path(status.getPath().getParent(), value.toString())));
                }
                in.close();
            } else {
                result.add(status);
            }
        }
    } else {
        final Path indexDir = OperationsParams.isWildcard(dir) ? dir.getParent() : dir;
        // Use the global index to limit files
        filter.selectCells(gindex, new ResultCollector<Partition>() {
            @Override
            public void collect(Partition partition) {
                try {
                    Path cell_path = new Path(indexDir, partition.filename);
                    if (!fs.exists(cell_path))
                        LOG.warn("Matched file not found: " + cell_path);
                    result.add(fs.getFileStatus(cell_path));
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        });
    }
}

From source file:edu.umn.cs.spatialHadoop.operations.CatUnion.java

License:Open Source License

/**
 * Read all categories from the category file
 * @param categoryFile/*from  w ww. j ava 2s .  co  m*/
 * @param categoryShapes
 * @param idToCategory
 * @throws IOException
 */
private static void readCategories(Path categoryFile, Map<Integer, Integer> idToCategory) throws IOException {
    Map<Integer, String> idToCatName = new HashMap<Integer, String>();
    FileSystem fsCategory = FileSystem.getLocal(new Configuration());
    long categoryFileSize = fsCategory.getFileStatus(categoryFile).getLen();
    if (categoryFileSize > 1024 * 1024)
        LOG.warn("Category file size is big: " + categoryFileSize);
    InputStream inCategory = fsCategory.open(categoryFile);
    LineRecordReader lineReader = new LineRecordReader(inCategory, 0, categoryFileSize, new Configuration());
    LongWritable lineOffset = lineReader.createKey();
    Text line = lineReader.createValue();

    Set<String> catNames = new TreeSet<String>();

    while (lineReader.next(lineOffset, line)) {
        int shape_id = TextSerializerHelper.consumeInt(line, ',');
        String cat_name = line.toString();
        catNames.add(cat_name);
        idToCatName.put(shape_id, cat_name);
    }

    lineReader.close();

    // Change category names to numbers
    Map<String, Integer> cat_name_to_id = new HashMap<String, Integer>();
    int cat_id = 0;
    for (String cat_name : catNames) {
        cat_name_to_id.put(cat_name, cat_id++);
    }

    for (Map.Entry<Integer, String> entry : idToCatName.entrySet()) {
        idToCategory.put(entry.getKey(), cat_name_to_id.get(entry.getValue()));
    }
}

From source file:mlbench.pagerank.PagerankMerge.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException, InterruptedException {
    try {/*from w w w. j  a v  a 2  s  . co  m*/
        parseArgs(args);
        HashMap<String, String> conf = new HashMap<String, String>();
        initConf(conf);
        MPI_D.Init(args, MPI_D.Mode.Common, conf);

        JobConf jobConf = new JobConf(confPath);
        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
            if (rank == 0) {
                LOG.info(PagerankMerge.class.getSimpleName() + " O start.");
            }
            FileSplit[] inputs = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O, jobConf,
                    inDir, rank);
            for (int i = 0; i < inputs.length; i++) {
                FileSplit fsplit = inputs[i];
                LineRecordReader kvrr = new LineRecordReader(jobConf, fsplit);

                LongWritable key = kvrr.createKey();
                Text value = kvrr.createValue();
                {
                    while (kvrr.next(key, value)) {
                        String line_text = value.toString();
                        final String[] line = line_text.split("\t");
                        if (line.length >= 2) {
                            MPI_D.Send(new IntWritable(Integer.parseInt(line[0])), new Text(line[1]));
                        }
                    }
                }
            }

        } else if (MPI_D.COMM_BIPARTITE_A != null) {
            // A communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
            if (rank == 0) {
                LOG.info(PagerankMerge.class.getSimpleName() + " A start.");
            }
            HadoopWriter<IntWritable, Text> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir,
                    IntWritable.class, Text.class, TextOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);

            IntWritable oldKey = null;
            double next_rank = 0;
            double previous_rank = 0;
            double diff = 0;
            int local_diffs = 0;
            random_coeff = (1 - mixing_c) / (double) number_nodes;
            converge_threshold = ((double) 1.0 / (double) number_nodes) / 10;
            Object[] keyValue = MPI_D.Recv();
            while (keyValue != null) {
                IntWritable key = (IntWritable) keyValue[0];
                Text value = (Text) keyValue[1];
                if (oldKey == null) {
                    oldKey = key;
                }
                if (!key.equals(oldKey)) {
                    next_rank = next_rank * mixing_c + random_coeff;
                    outrw.write(oldKey, new Text("v" + next_rank));
                    diff = Math.abs(previous_rank - next_rank);
                    if (diff > converge_threshold) {
                        local_diffs += 1;
                    }
                    oldKey = key;
                    next_rank = 0;
                    previous_rank = 0;
                }

                String cur_value_str = value.toString();
                if (cur_value_str.charAt(0) == 's') {
                    previous_rank = Double.parseDouble(cur_value_str.substring(1));
                } else {
                    next_rank += Double.parseDouble(cur_value_str.substring(1));
                }

                keyValue = MPI_D.Recv();
            }
            if (previous_rank != 0) {
                next_rank = next_rank * mixing_c + random_coeff;
                outrw.write(oldKey, new Text("v" + next_rank));
                diff = Math.abs(previous_rank - next_rank);
                if (diff > converge_threshold)
                    local_diffs += 1;
            }
            outrw.close();
            reduceDiffs(local_diffs, rank);
        }

        MPI_D.Finalize();
    } catch (MPI_D_Exception e) {
        e.printStackTrace();
    }
}

From source file:mlbench.pagerank.PagerankNaive.java

License:Apache License

@SuppressWarnings({ "rawtypes", "unchecked" })
public static void main(String[] args) throws IOException, InterruptedException {
    try {/*from   ww w .  j  ava  2s .  c  o  m*/
        parseArgs(args);
        HashMap<String, String> conf = new HashMap<String, String>();
        initConf(conf);
        MPI_D.Init(args, MPI_D.Mode.Common, conf);

        JobConf jobConf = new JobConf(confPath);
        if (MPI_D.COMM_BIPARTITE_O != null) {
            // O communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_O);
            int size = MPI_D.Comm_size(MPI_D.COMM_BIPARTITE_O);
            if (rank == 0) {
                LOG.info(PagerankNaive.class.getSimpleName() + " O start.");
            }
            FileSplit[] inputs1 = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                    jobConf, edgeDir, rank);
            FileSplit[] inputs2 = DataMPIUtil.HDFSDataLocalLocator.getTaskInputs(MPI_D.COMM_BIPARTITE_O,
                    jobConf, vecDir, rank);
            FileSplit[] inputs = (FileSplit[]) ArrayUtils.addAll(inputs2, inputs1);
            for (int i = 0; i < inputs.length; i++) {
                FileSplit fsplit = inputs[i];
                LineRecordReader kvrr = new LineRecordReader(jobConf, fsplit);

                LongWritable key = kvrr.createKey();
                Text value = kvrr.createValue();
                {
                    IntWritable k = new IntWritable();
                    Text v = new Text();
                    while (kvrr.next(key, value)) {
                        String line_text = value.toString();
                        // ignore comments in edge file
                        if (line_text.startsWith("#"))
                            continue;

                        final String[] line = line_text.split("\t");
                        if (line.length < 2)
                            continue;

                        // vector : ROWID VALUE('vNNNN')
                        if (line[1].charAt(0) == 'v') {
                            k.set(Integer.parseInt(line[0]));
                            v.set(line[1]);
                            MPI_D.Send(k, v);
                        } else {
                            /*
                             * In other matrix-vector multiplication, we
                            * output (dst, src) here However, In PageRank,
                            * the matrix-vector computation formula is M^T
                            * * v. Therefore, we output (src,dst) here.
                            */
                            int src_id = Integer.parseInt(line[0]);
                            int dst_id = Integer.parseInt(line[1]);
                            k.set(src_id);
                            v.set(line[1]);
                            MPI_D.Send(k, v);

                            if (make_symmetric == 1) {
                                k.set(dst_id);
                                v.set(line[0]);
                                MPI_D.Send(k, v);
                            }
                        }
                    }
                }
            }

        } else if (MPI_D.COMM_BIPARTITE_A != null) {
            // A communicator
            int rank = MPI_D.Comm_rank(MPI_D.COMM_BIPARTITE_A);
            if (rank == 0) {
                LOG.info(PagerankNaive.class.getSimpleName() + " A start.");
            }

            HadoopWriter<IntWritable, Text> outrw = HadoopIOUtil.getNewWriter(jobConf, outDir,
                    IntWritable.class, Text.class, TextOutputFormat.class, null, rank, MPI_D.COMM_BIPARTITE_A);

            IntWritable oldKey = null;
            int i;
            double cur_rank = 0;
            ArrayList<Integer> dst_nodes_list = new ArrayList<Integer>();
            Object[] keyValue = MPI_D.Recv();
            while (keyValue != null) {
                IntWritable key = (IntWritable) keyValue[0];
                Text value = (Text) keyValue[1];
                if (oldKey == null) {
                    oldKey = key;
                }
                // A new key arrives
                if (!key.equals(oldKey)) {
                    outrw.write(oldKey, new Text("s" + cur_rank));
                    int outdeg = dst_nodes_list.size();
                    if (outdeg > 0) {
                        cur_rank = cur_rank / (double) outdeg;
                    }
                    for (i = 0; i < outdeg; i++) {
                        outrw.write(new IntWritable(dst_nodes_list.get(i)), new Text("v" + cur_rank));
                    }
                    oldKey = key;
                    cur_rank = 0;
                    dst_nodes_list = new ArrayList<Integer>();
                }
                // common record
                String line_text = value.toString();
                final String[] line = line_text.split("\t");
                if (line.length == 1) {
                    if (line_text.charAt(0) == 'v') { // vector : VALUE
                        cur_rank = Double.parseDouble(line_text.substring(1));
                    } else { // edge : ROWID
                        dst_nodes_list.add(Integer.parseInt(line[0]));
                    }
                }
                keyValue = MPI_D.Recv();
            }
            // write the left part
            if (cur_rank != 0) {
                outrw.write(oldKey, new Text("s" + cur_rank));
                int outdeg = dst_nodes_list.size();
                if (outdeg > 0) {
                    cur_rank = cur_rank / (double) outdeg;
                }
                for (i = 0; i < outdeg; i++) {
                    outrw.write(new IntWritable(dst_nodes_list.get(i)), new Text("v" + cur_rank));
                }
            }
            outrw.close();
        }
        MPI_D.Finalize();
    } catch (MPI_D_Exception e) {
        e.printStackTrace();
    }
}