Example usage for org.apache.hadoop.io.Text Text()

Introduction

This page collects example usages of the Text() constructor of org.apache.hadoop.io.Text.

Prototype

public Text() 
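
A minimal sketch of the pattern that recurs in the examples below (class and variable names are illustrative, not taken from any of the listed sources): construct an empty, reusable Text with the no-argument constructor and populate it later with set(), instead of allocating a new object per record.

import org.apache.hadoop.io.Text;

public class TextConstructorExample {
    public static void main(String[] args) {
        // Start with an empty Text and fill it in later.
        Text buffer = new Text();
        buffer.set("hadoop");
        System.out.println(buffer + " (" + buffer.getLength() + " bytes)");

        // Reuse the same instance for the next value; the examples below
        // hoist new Text() out of their read/write loops for the same reason.
        buffer.set("another record");
        System.out.println(buffer);
    }
}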

Usage

From source file:authordetect.input.SingleBookReader.java

/**
 * @param inputSplit the file split to read
 * @param context    the information about the task
 * @throws java.io.IOException
 * @throws InterruptedException
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    FileSplit split = (FileSplit) inputSplit;
    Configuration configuration = context.getConfiguration();

    // get the option from configuration:
    // 0 for group by author, 1 for group by book
    int option = configuration.getInt("GROUP_OPTION", 0);

    Path path = split.getPath();
    filename = path.getName();
    FileSystem fileSystem = path.getFileSystem(configuration);
    FSDataInputStream inputStream = fileSystem.open(path);
    lineReader = new LineReader(inputStream, configuration);

    //initial start and end byte offsets of this split
    start = split.getStart();
    end = start + split.getLength();

    inputStream.seek(start);
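    // If this split does not begin at the start of the file, the previous split has
    // already consumed the line that straddles the split boundary, so skip it here;
    // the throwaway new Text() only serves as a scratch buffer for the discarded bytes.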
    if (start != 0) {
        start += lineReader.readLine(new Text(), 0, (int) Math.min(Integer.MAX_VALUE, end - start));
    }

    start += lineReader.readLine(currentLine);

    prepareToScanBook(option);
}

From source file:be.uantwerpen.adrem.disteclat.ItemReaderReducer.java

License:Apache License

/**
 * Writes the singletons distribution to file OSingletonsDistribution. The distribution is obtained using Round-Robin
 *  allocation.
 * 
 * @param sortedSingletons
 *          the sorted list of singletons
 * @throws IOException
 * @throws InterruptedException
 */
private void writeSingletonsDistribution(List<Integer> sortedSingletons)
        throws IOException, InterruptedException {
    int end = Math.min(numberOfMappers, sortedSingletons.size());

    Text mapperId = new Text();
    Text assignedItems = new Text();

    // Round robin assignment
    for (int ix = 0; ix < end; ix++) {
        StringBuilder sb = new StringBuilder();
        for (int ix1 = ix; ix1 < sortedSingletons.size(); ix1 += numberOfMappers) {
            sb.append(sortedSingletons.get(ix1)).append(" ");
        }

        mapperId.set("" + ix);
        assignedItems.set(sb.substring(0, sb.length() - 1));
        mos.write(OSingletonsDistribution, mapperId, assignedItems);
    }
}
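
For intuition, with hypothetical values (not taken from the source): if numberOfMappers is 3 and sortedSingletons is [5, 9, 2, 7, 4], the reducer writes mapper 0 -> "5 7", mapper 1 -> "9 4", and mapper 2 -> "2"; in general the item at index i is assigned to mapper i % numberOfMappers.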

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Splits the given file into the requested number of roughly line-balanced file splits.
 * 
 * @param status
 *          file status
 * @param conf
 *          hadoop configuration object
 * @param numberOfSplits
 *          number of splits to split the data in
 * @return list of file splits
 * @throws IOException
 *           thrown if the path is a directory or the file cannot be read
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits)
        throws IOException {
    List<FileSplit> splits = newArrayList();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    long totalNumberOfLines = getTotalNumberOfLines(conf, fileName);
    int numLinesPerSplit = (int) Math.ceil(1.0 * totalNumberOfLines / numberOfSplits);
    LineReader lr = null;
    FSDataInputStream in = null;
    try {
        in = fileName.getFileSystem(conf).open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                splits.add(createFileSplit(fileName, begin, length));
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(createFileSplit(fileName, begin, length));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
        if (in != null) {
            in.close();
        }
    }
    return splits;
}
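
A minimal driver-side sketch of how this helper might be invoked (the method name printSplits, the input path, and the fixed split count of 4 are illustrative assumptions, not from the source):

public static void printSplits(Configuration conf, String inputFile) throws IOException {
    Path input = new Path(inputFile);
    FileStatus status = input.getFileSystem(conf).getFileStatus(input);

    // Ask for 4 splits; each covers roughly a quarter of the file's lines.
    List<FileSplit> splits = SplitByKTextInputFormat.getSplitsForFile(status, conf, 4);
    for (FileSplit split : splits) {
        System.out.println(split.getPath() + " start=" + split.getStart() + " length=" + split.getLength());
    }
}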

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Gets the total number of lines from the file. If Config.NUMBER_OF_LINES_KEY is set, this value is returned.
 * 
 * @param conf
 *          hadoop configuration object
 * @param fileName
 *          name of file to count
 * @return the number of lines in the file
 * @throws IOException
 */
public static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    try {
        FSDataInputStream in = fileName.getFileSystem(conf).open(fileName);
        LineReader lr = new LineReader(in, conf);
        Text text = new Text();
        nrLines = 0;
        while (lr.readLine(text) > 0) {
            nrLines++;
        }
        in.close();
        return nrLines;
    } catch (IOException e) {
        e.printStackTrace();
    }
    return 0;
}
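
When the total line count is already known, the counting pass can be skipped by setting the key up front; a minimal sketch (assuming the NUMBER_OF_LINES_KEY constant referenced above is publicly accessible, for example via the Config class mentioned in the javadoc):

Configuration conf = new Configuration();
// With this set, getTotalNumberOfLines() returns immediately instead of scanning the whole file.
conf.setLong(Config.NUMBER_OF_LINES_KEY, 1000000L);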

From source file:be.ugent.intec.halvade.tools.AlignerInstance.java

License:Open Source License

protected AlignerInstance(Mapper.Context context, String bin) throws IOException, URISyntaxException {
    AlignerInstance.context = context;
    header = null;
    containers = HalvadeConf.getMapContainerCount(context.getConfiguration());
    tasksLeft = HalvadeConf.getMapTasksLeft(context.getConfiguration());
    redistribute = HalvadeConf.getRedistribute(context.getConfiguration());
    writableRecord = new SAMRecordWritable();
    writableRegion = new ChromosomeRegion();
    writeableCompactRegion = new GenomeSJ();
    stub = new Text();
    minChrLength = HalvadeConf.getMinChrLength(context.getConfiguration());
    chr = HalvadeConf.getChrList(context.getConfiguration());

    tmpdir = HalvadeConf.getScratchTempDir(context.getConfiguration());
    if (!tmpdir.endsWith("/"))
        tmpdir = tmpdir + "/";
    File tmp = new File(tmpdir);
    tmp.mkdirs();
    this.bin = bin;
    threads = HalvadeConf.getMapThreads(context.getConfiguration());
    isPaired = HalvadeConf.getIsPaired(context.getConfiguration());
    Logger.DEBUG("paired? " + isPaired);
    splitter = new ChromosomeSplitter(HalvadeConf.getBedRegions(context.getConfiguration()),
            context.getConfiguration());
    keepChrSplitPairs = HalvadeConf.getkeepChrSplitPairs(context.getConfiguration());
    keep = HalvadeConf.getKeepFiles(context.getConfiguration());
}

From source file:be.ugent.intec.halvade.tools.STARInstance.java

License:Open Source License

private void emitJSFile(String starOutDir, Mapper.Context context) throws InterruptedException {
    SAMSequenceDictionary dict = null;
    try {
        dict = HalvadeConf.getSequenceDictionary(context.getConfiguration());
    } catch (IOException ex) {
        Logger.EXCEPTION(ex);
        throw new InterruptedException("Error getting the SAMSequenceDictionary for SJ processing");
    }
    BufferedReader br = null;
    val = new Text();
    sj = new GenomeSJ();
    try {
        br = new BufferedReader(new FileReader(starOutDir + "/SJ.out.tab"));
        String line = br.readLine();
        while (line != null) {
            // key each splice-junction line by its own parsed GenomeSJ entry
            sj.parseSJString(line, dict);
            val.set(line);
            context.write(sj, val);
            line = br.readLine();
        }
    } catch (IOException | InterruptedException ex) {
        Logger.EXCEPTION(ex);
    } finally {
        if (br != null) {
            try {
                br.close();
            } catch (IOException ex) {
                Logger.EXCEPTION(ex);
            }
        }
    }
}

From source file:bigsatgps.BigDataHandler.java

License:Open Source License

/**
 *
 * @param infile the path of the input image file
 * @return the path of the created sequence file, or null if writing failed
 * @throws Exception
 */
public String ImageToSequence(String infile) throws Exception {
    String log4jConfPath = "lib/log4j.properties";
    PropertyConfigurator.configure(log4jConfPath);
    confHadoop = new Configuration();
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(confHadoop);
    Path inPath = new Path(infile);
    String outfile = infile.substring(0, infile.indexOf(".")) + ".seq";
    Path outPath = new Path(outfile);
    FSDataInputStream in = null;
    Text key = new Text();
    BytesWritable value = new BytesWritable();
    SequenceFile.Writer writer = null;
    try {
        in = fs.open(inPath);
        byte buffer[] = new byte[in.available()];
        // readFully avoids the short reads a plain read(buffer) may return
        in.readFully(0, buffer);
        writer = SequenceFile.createWriter(fs, confHadoop, outPath, key.getClass(), value.getClass());
        writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
        System.out.println("Successfully created the sequence file " + outfile);
        return outfile;
    } catch (IOException e) {
        System.err.println("Exception MESSAGES = " + e.getMessage());
        return null;
    } finally {
        IOUtils.closeStream(writer);
        IOUtils.closeStream(in);
    }
}
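
To verify the result, the sequence file produced above can be read back with the standard SequenceFile.Reader API; a minimal sketch (the helper name dumpSequenceFile is hypothetical):

public static void dumpSequenceFile(String seqFile) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, new Path(seqFile), conf);
    try {
        Text key = new Text();
        BytesWritable value = new BytesWritable();
        while (reader.next(key, value)) {
            // key holds the original file name, value the raw image bytes
            System.out.println(key + ": " + value.getLength() + " bytes");
        }
    } finally {
        reader.close();
    }
}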

From source file:bixo.examples.crawl.MultiDomainUrlFilter.java

License:Apache License

public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    try {//process the file passed in
        if (filterFile != null) {
            FileSystem fs = filterFile.getFileSystem(conf);
            if (fs.exists(filterFile)) {
                FSDataInputStream in = fs.open(filterFile);
                LineReader lr = new LineReader(in);
                Text tmpStr = new Text();
                while (lr.readLine(tmpStr) > 0 && !tmpStr.toString().equals("")) {//skip blank lines
                    String p = tmpStr.toString().trim();//remove whitespace
                    if (p.substring(0, 1).equals("+")) {// '+' means do-crawl
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(Boolean.TRUE);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } else if (p.substring(0, 1).equals("-")) {// '-' means filter out
                        ArrayList filterPair = new ArrayList();
                        filterPair.add(Boolean.FALSE);
                        filterPair.add(Pattern.compile(p.substring(1, p.length())));
                        _filters.add(filterPair);
                    } // otherwise a comment or malformed filter pattern
                }
                lr.close();
            }
        }

    } catch (Exception e) {
        //any cleanup here? This would indicate a file system error, most likely
        throw e;
    }
}
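
Given the parsing above, a filter file contains one regular expression per line, prefixed with '+' for patterns whose matching URLs should be crawled and '-' for patterns that should be filtered out; lines with any other prefix are skipped as comments or malformed patterns. A hypothetical example of such a file (not from the source):

+(?i)^https?://(www\.)?example\.com/.*
-(?i)\.(jpg|jpeg|png|gif)$
# anything else is ignored as a comment or malformed pattern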

From source file:bixo.examples.crawl.RegexUrlFilter.java

License:Apache License

public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    //this reads regex filters from a file in HDFS or the native file system
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (fs.exists(filterFile)) {
        FSDataInputStream in = fs.open(filterFile);
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
        in.close();
    } else {
        LOGGER.info("Can't find file: " + urlFiltersFile);
    }
    return filterList;
}

From source file:boa.datagen.SeqProjectCombiner.java

License:Apache License

public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "hdfs://boa-njt/");
    FileSystem fileSystem = FileSystem.get(conf);
    String base = conf.get("fs.default.name", "");

    HashMap<String, String> sources = new HashMap<String, String>();
    HashSet<String> marks = new HashSet<String>();
    FileStatus[] files = fileSystem.listStatus(new Path(base + "tmprepcache/2015-07"));
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String name = file.getPath().getName();
        if (name.startsWith("projects-") && name.endsWith(".seq")) {
            System.out.println("Reading file " + i + " in " + files.length + ": " + name);
            SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
            final Text key = new Text();
            final BytesWritable value = new BytesWritable();
            try {
                while (r.next(key, value)) {
                    String s = key.toString();
                    if (marks.contains(s))
                        continue;
                    Project p = Project
                            .parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength()));
                    if (p.getCodeRepositoriesCount() > 0 && p.getCodeRepositories(0).getRevisionsCount() > 0)
                        marks.add(s);
                    sources.put(s, name);
                }
            } catch (Exception e) {
                System.err.println(name);
                e.printStackTrace();
            }
            r.close();
        }
    }
    SequenceFile.Writer w = SequenceFile.createWriter(fileSystem, conf,
            new Path(base + "repcache/2015-07/projects.seq"), Text.class, BytesWritable.class);
    for (int i = 0; i < files.length; i++) {
        FileStatus file = files[i];
        String name = file.getPath().getName();
        if (name.startsWith("projects-") && name.endsWith(".seq")) {
            System.out.println("Reading file " + i + " in " + files.length + ": " + name);
            SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
            final Text key = new Text();
            final BytesWritable value = new BytesWritable();
            try {
                while (r.next(key, value)) {
                    String s = key.toString();
                    if (sources.get(s).equals(name))
                        w.append(key, value);
                }
            } catch (Exception e) {
                System.err.println(name);
                e.printStackTrace();
            }
            r.close();
        }
    }
    w.close();

    fileSystem.close();
}