Example usage for the org.apache.hadoop.io.Text constructor Text()

Introduction

On this page you can find example usages of the no-argument constructor Text() of the class org.apache.hadoop.io.Text.

Prototype

public Text()

Source Link

Usage

From source file:authordetect.input.SingleBookReader.java

/**
 * Opens the file behind the given split, wraps it in a line reader and reads the
 * first line of the split into {@code currentLine} before handing over to
 * {@code prepareToScanBook}.
 *
 * @param inputSplit the split to read; it is cast to {@link FileSplit}
 * @param context    the information about the task
 * @throws java.io.IOException  if the file cannot be opened, positioned or read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    final FileSplit fileSplit = (FileSplit) inputSplit;
    final Configuration conf = context.getConfiguration();

    // Grouping mode taken from the job configuration:
    // 0 for group by author, 1 for group by book (defaults to 0).
    final int groupOption = conf.getInt("GROUP_OPTION", 0);

    final Path bookPath = fileSplit.getPath();
    filename = bookPath.getName();
    final FSDataInputStream stream = bookPath.getFileSystem(conf).open(bookPath);
    lineReader = new LineReader(stream, conf);

    // Byte range covered by this split.
    start = fileSplit.getStart();
    end = start + fileSplit.getLength();

    stream.seek(start);
    if (start != 0) {
        // Not at the beginning of the file: read one line into a throw-away buffer
        // and advance the start offset by the number of bytes consumed.
        final int maxBytesToConsume = (int) Math.min(Integer.MAX_VALUE, end - start);
        start += lineReader.readLine(new Text(), 0, maxBytesToConsume);
    }

    // Load the first line that will actually be processed.
    start += lineReader.readLine(currentLine);

    prepareToScanBook(groupOption);
}

From source file:be.uantwerpen.adrem.disteclat.ItemReaderReducer.java

License:Apache License

/**
 * Writes the singletons distribution to file OSingletonsDistribution. The distribution is obtained using Round-Robin
 * allocation./*from  w  w  w  .ja  va  2s.  c o m*/
 * 
 * @param sortedSingletons
 *          the sorted list of singletons
 * @throws IOException
 * @throws InterruptedException
 */
private void writeSingletonsDistribution(List<Integer> sortedSingletons)
        throws IOException, InterruptedException {
    int end = Math.min(numberOfMappers, sortedSingletons.size());

    Text mapperId = new Text();
    Text assignedItems = new Text();

    // Round robin assignment
    for (int ix = 0; ix < end; ix++) {
        StringBuilder sb = new StringBuilder();
        for (int ix1 = ix; ix1 < sortedSingletons.size(); ix1 += numberOfMappers) {
            sb.append(sortedSingletons.get(ix1)).append(" ");
        }

        mapperId.set("" + ix);
        assignedItems.set(sb.substring(0, sb.length() - 1));
        mos.write(OSingletonsDistribution, mapperId, assignedItems);
    }
}

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Gets the different file splits for the data based on a given number of splits
 * /*  www.  j  av a  2s . c  om*/
 * @param status
 *          file status
 * @param conf
 *          hadoop configuration object
 * @param numberOfSplits
 *          number of splits to split the data in
 * @return list of file splits
 * @throws IOException
 *           thrown if the file does not exist
 */
public static List<FileSplit> getSplitsForFile(FileStatus status, Configuration conf, int numberOfSplits)
        throws IOException {
    List<FileSplit> splits = newArrayList();
    Path fileName = status.getPath();
    if (status.isDir()) {
        throw new IOException("Not a file: " + fileName);
    }
    long totalNumberOfLines = getTotalNumberOfLines(conf, fileName);
    int numLinesPerSplit = (int) Math.ceil(1.0 * totalNumberOfLines / numberOfSplits);
    LineReader lr = null;
    FSDataInputStream in = null;
    try {
        in = fileName.getFileSystem(conf).open(fileName);
        lr = new LineReader(in, conf);
        Text line = new Text();
        int numLines = 0;
        long begin = 0;
        long length = 0;
        int num = -1;
        while ((num = lr.readLine(line)) > 0) {
            numLines++;
            length += num;
            if (numLines == numLinesPerSplit) {
                splits.add(createFileSplit(fileName, begin, length));
                begin += length;
                length = 0;
                numLines = 0;
            }
        }
        if (numLines != 0) {
            splits.add(createFileSplit(fileName, begin, length));
        }
    } finally {
        if (lr != null) {
            lr.close();
        }
        if (in != null) {
            in.close();
        }
    }
    return splits;
}

From source file:be.uantwerpen.adrem.hadoop.util.SplitByKTextInputFormat.java

License:Apache License

/**
 * Gets the total number of lines from the file. If NUMBER_OF_LINES_KEY is set in the configuration to a value
 * other than -1, that value is returned without touching the file; otherwise the file is read and its lines
 * are counted.
 * <p>
 * I/O failures while counting are printed to standard error and reported as a count of 0 (existing
 * best-effort behaviour). The input stream is always closed, including when reading fails.
 *
 * @param conf
 *          hadoop configuration object
 * @param fileName
 *          name of file to count
 * @return the number of lines in the file, or 0 if the file could not be read
 * @throws IOException
 *           declared for callers; read failures are currently handled inside this method
 */
public static long getTotalNumberOfLines(Configuration conf, Path fileName) throws IOException {
    long nrLines = conf.getLong(NUMBER_OF_LINES_KEY, -1);
    if (nrLines != -1) {
        return nrLines;
    }

    try {
        FSDataInputStream in = fileName.getFileSystem(conf).open(fileName);
        try {
            LineReader lr = new LineReader(in, conf);
            Text text = new Text();
            nrLines = 0;
            while (lr.readLine(text) > 0) {
                nrLines++;
            }
        } finally {
            // Close even when readLine fails, so the stream is not leaked.
            in.close();
        }
        return nrLines;
    } catch (IOException e) {
        e.printStackTrace();
    }
    return 0;
}

From source file:be.ugent.intec.halvade.tools.AlignerInstance.java

License:Open Source License

/**
 * Initialises the aligner from the job configuration carried by the mapper context: reusable output
 * objects, container/task/thread settings, the scratch directory and the chromosome splitter.
 *
 * @param context mapper context; its configuration is the source of all settings read here
 * @param bin     stored as-is in the {@code bin} field
 * @throws IOException        declared by this constructor (presumably raised by the helpers called here)
 * @throws URISyntaxException declared by this constructor (presumably raised by the helpers called here)
 */
protected AlignerInstance(Mapper.Context context, String bin) throws IOException, URISyntaxException {
    AlignerInstance.context = context; // class-level field, so shared by every instance
    header = null;
    containers = HalvadeConf.getMapContainerCount(context.getConfiguration());
    tasksLeft = HalvadeConf.getMapTasksLeft(context.getConfiguration());
    redistribute = HalvadeConf.getRedistribute(context.getConfiguration());
    // Output objects created once here.
    writableRecord = new SAMRecordWritable();
    writableRegion = new ChromosomeRegion();
    writeableCompactRegion = new GenomeSJ();
    stub = new Text();
    minChrLength = HalvadeConf.getMinChrLength(context.getConfiguration());
    chr = HalvadeConf.getChrList(context.getConfiguration());

    // Normalise the scratch directory to end with '/' and create it if it is missing.
    tmpdir = HalvadeConf.getScratchTempDir(context.getConfiguration());
    if (!tmpdir.endsWith("/"))
        tmpdir = tmpdir + "/";
    File tmp = new File(tmpdir);
    // NOTE(review): the boolean result of mkdirs() is ignored, so a failure to create the directory
    // is not detected here.
    tmp.mkdirs();
    this.bin = bin;
    threads = HalvadeConf.getMapThreads(context.getConfiguration());
    isPaired = HalvadeConf.getIsPaired(context.getConfiguration());
    Logger.DEBUG("paired? " + isPaired);
    splitter = new ChromosomeSplitter(HalvadeConf.getBedRegions(context.getConfiguration()),
            context.getConfiguration());
    keepChrSplitPairs = HalvadeConf.getkeepChrSplitPairs(context.getConfiguration());
    keep = HalvadeConf.getKeepFiles(context.getConfiguration());
}

From source file:be.ugent.intec.halvade.tools.STARInstance.java

License:Open Source License

/**
 * Reads {@code <starOutDir>/SJ.out.tab} line by line and writes every line to the mapper context as a
 * {@code Text} value, keyed by the {@code GenomeSJ} parsed from the first line of the file.
 * <p>
 * I/O errors while reading the file are logged and end the emission early. An interruption raised while
 * writing to the context is logged and propagated to the caller.
 *
 * @param starOutDir directory containing the {@code SJ.out.tab} file
 * @param context    mapper context that supplies the configuration and receives the output
 * @throws InterruptedException if the sequence dictionary cannot be loaded, or if writing to the context
 *                              is interrupted
 */
private void emitJSFile(String starOutDir, Mapper.Context context) throws InterruptedException {
    SAMSequenceDictionary dict;
    try {
        dict = HalvadeConf.getSequenceDictionary(context.getConfiguration());
    } catch (IOException ex) {
        Logger.EXCEPTION(ex);
        throw new InterruptedException("Error getting the SAMSequenceDictionary for SJ processing");
    }

    val = new Text();
    sj = new GenomeSJ();
    // try-with-resources closes the reader on every path; a failure while closing is an IOException
    // and is logged by the handler below, as before.
    try (BufferedReader br = new BufferedReader(new FileReader(starOutDir + "/SJ.out.tab"))) {
        String line = br.readLine();
        // An empty file yields null here; do not hand that to the parser.
        // NOTE(review): the key is parsed from the first line only, so every emitted line shares that
        // key. Confirm this is intended rather than a per-line parse.
        if (line != null) {
            sj.parseSJString(line, dict);
        }
        while (line != null) {
            val.set(line);
            context.write(sj, val);
            line = br.readLine();
        }
    } catch (IOException ex) {
        Logger.EXCEPTION(ex);
    } catch (InterruptedException ex) {
        // The method declares InterruptedException: log it, then let the caller see the interruption
        // instead of silently continuing.
        Logger.EXCEPTION(ex);
        throw ex;
    }
}

From source file:bigsatgps.BigDataHandler.java

License:Open Source License

/**
 * Copies the complete content of a file into a new Hadoop sequence file that holds a single record:
 * the file's name as a {@code Text} key and its bytes as a {@code BytesWritable} value.
 * <p>
 * The output path is the input path with everything from the first '.' of the file name replaced by
 * {@code .seq}; if the file name has no '.', {@code .seq} is appended.
 *
 * @param infile path of the file to convert
 * @return the path of the sequence file that was written, or {@code null} if an I/O error occurred
 * @throws Exception declared by this method; I/O errors from the conversion itself are reported via the
 *                   {@code null} return value
 */
public String ImageToSequence(String infile) throws Exception {
    String log4jConfPath = "lib/log4j.properties";
    PropertyConfigurator.configure(log4jConfPath);
    confHadoop = new Configuration();
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/core-site.xml"));
    confHadoop.addResource(new Path("/hadoop/projects/hadoop-1.0.4/conf/hdfs-site.xml"));
    FileSystem fs = FileSystem.get(confHadoop);
    Path inPath = new Path(infile);

    // Look for the extension inside the file name only, so a '.' in a directory name (or a missing
    // extension) no longer produces a wrong path or a StringIndexOutOfBoundsException.
    int nameStart = infile.lastIndexOf('/') + 1;
    int dot = infile.indexOf('.', nameStart);
    String outfile = (dot >= 0 ? infile.substring(0, dot) : infile) + ".seq";
    Path outPath = new Path(outfile);

    FSDataInputStream in = null;
    SequenceFile.Writer writer = null;
    try {
        // Size the buffer from the file length; available() is only an estimate of what can be read
        // without blocking.
        long length = fs.getFileStatus(inPath).getLen();
        if (length > Integer.MAX_VALUE) {
            throw new IOException("File too large for a single record: " + infile + " (" + length + " bytes)");
        }
        byte[] buffer = new byte[(int) length];
        in = fs.open(inPath);
        // A single read() may return fewer bytes than requested; readFully fills the whole buffer.
        in.readFully(buffer);

        writer = SequenceFile.createWriter(fs, confHadoop, outPath, Text.class, BytesWritable.class);
        writer.append(new Text(inPath.getName()), new BytesWritable(buffer));
        // Close explicitly so that a failure while flushing is reported rather than swallowed.
        writer.close();
        writer = null;

        // Report success only once the file has really been written.
        System.out.println();
        System.out.println("Successfully created the sequencefile " + outfile);
        return outfile;
    } catch (IOException e) {
        System.err.println("Exception MESSAGES = " + e.getMessage());
        return null;
    } finally {
        // closeStream ignores null arguments and close failures.
        IOUtils.closeStream(writer);
        IOUtils.closeStream(in);
    }
}

From source file:bixo.examples.crawl.MultiDomainUrlFilter.java

License:Apache License

/**
 * Builds the filter from an optional filter file. Each line of the file is trimmed; a line starting with
 * '+' adds a do-crawl pattern and a line starting with '-' adds a filter-out pattern, where the rest of
 * the line is compiled as a regular expression. Blank lines and any other lines (comments, malformed
 * entries) are ignored.
 *
 * @param filterFile path of the filter file; may be {@code null} or refer to a missing file, in which
 *                   case only the built-in suffix and protocol patterns are set
 * @throws Exception if the file system cannot be accessed, the file cannot be read, or a pattern does
 *                   not compile
 */
public MultiDomainUrlFilter(Path filterFile) throws Exception {
    //we could require a filter file and put these in all urls or leave them here
    _suffixExclusionPattern = Pattern.compile("(?i)\\.(pdf|zip|gzip|gz|sit|bz|bz2|tar|tgz|exe)$");
    _protocolInclusionPattern = Pattern.compile("(?i)^(http|https)://");

    JobConf conf = HadoopUtils.getDefaultJobConf();
    if (filterFile == null) {
        return;
    }
    FileSystem fs = filterFile.getFileSystem(conf);
    if (!fs.exists(filterFile)) {
        return;
    }

    FSDataInputStream in = fs.open(filterFile);
    try {
        LineReader lr = new LineReader(in);
        Text tmpStr = new Text();
        // readLine returns 0 only at end of file, so blank lines do not end the loop.
        while (lr.readLine(tmpStr) > 0) {
            String p = tmpStr.toString().trim();//remove whitespace
            if (p.isEmpty()) {
                continue;//skip blank (or whitespace-only) lines
            }
            boolean doCrawl = p.startsWith("+");// '+' means do-crawl
            if (doCrawl || p.startsWith("-")) {// '-' means filter out
                // Pair layout: [Boolean include-flag, compiled Pattern]
                ArrayList filterPair = new ArrayList();
                filterPair.add(Boolean.valueOf(doCrawl));
                filterPair.add(Pattern.compile(p.substring(1)));
                _filters.add(filterPair);
            } // otherwise a comment or malformed filter pattern
        }
    } finally {
        // Release the stream whether or not reading succeeded.
        in.close();
    }
}

From source file:bixo.examples.crawl.RegexUrlFilter.java

License:Apache License

/**
 * Reads regex filters from a file in HDFS or the native file system. A line is kept, trimmed, when it
 * is not blank and starts with INCLUDE_CHAR or EXCLUDE_CHAR; all other lines are ignored.
 *
 * @param urlFiltersFile path of the filter file
 * @return the accepted filter lines in file order; empty if the file does not exist
 * @throws IOException          if the file system cannot be accessed or the file cannot be read
 * @throws InterruptedException declared by this method
 */
public static List<String> getUrlFilterPatterns(String urlFiltersFile)
        throws IOException, InterruptedException {
    JobConf conf = HadoopUtils.getDefaultJobConf();
    Path filterFile = new Path(urlFiltersFile);
    FileSystem fs = filterFile.getFileSystem(conf);
    List<String> filterList = new ArrayList<String>();
    LOGGER.info("Looking for file: " + urlFiltersFile);
    if (!fs.exists(filterFile)) {
        LOGGER.info("Can't find file: " + urlFiltersFile);
        return filterList;
    }

    FSDataInputStream in = fs.open(filterFile);
    try {
        LineReader reader = new LineReader(in);
        Text tLine = new Text();
        while (reader.readLine(tLine) > 0) {
            String line = tLine.toString();
            if (StringUtils.isNotBlank(line)
                    && (line.startsWith(INCLUDE_CHAR) || line.startsWith(EXCLUDE_CHAR))) {
                filterList.add(line.trim());
            }
        }
    } finally {
        // Close the stream even when reading fails part-way through.
        in.close();
    }
    return filterList;
}

From source file:boa.datagen.SeqProjectCombiner.java

License:Apache License

/**
 * Merges all {@code projects-*.seq} files found under {@code tmprepcache/2015-07} into the single
 * sequence file {@code repcache/2015-07/projects.seq}, keeping one record per key.
 * <p>
 * Pass 1 decides, for every key, which input file supplies its record: the last file seen wins until a
 * file provides a project that has at least one code repository with at least one revision, after which
 * the key is locked to that file. Pass 2 copies each record from the file chosen for its key.
 * <p>
 * Errors while reading one input file are printed and that file is abandoned; processing continues with
 * the next file.
 *
 * @param args unused
 * @throws IOException if the file system, an input reader or the output writer cannot be opened or closed
 */
public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    conf.set("fs.default.name", "hdfs://boa-njt/");
    FileSystem fileSystem = FileSystem.get(conf);
    try {
        String base = conf.get("fs.default.name", "");

        // key -> name of the file whose record should be copied
        HashMap<String, String> sources = new HashMap<String, String>();
        // keys already locked to a file that has revision data
        HashSet<String> marks = new HashSet<String>();
        FileStatus[] files = fileSystem.listStatus(new Path(base + "tmprepcache/2015-07"));

        // Pass 1: choose the source file for every key.
        for (int i = 0; i < files.length; i++) {
            FileStatus file = files[i];
            String name = file.getPath().getName();
            if (!isProjectSeqFile(name)) {
                continue;
            }
            System.out.println("Reading file " + i + " in " + files.length + ": " + name);
            SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
            final Text key = new Text();
            final BytesWritable value = new BytesWritable();
            try {
                while (r.next(key, value)) {
                    String s = key.toString();
                    if (marks.contains(s))
                        continue;
                    Project p = Project
                            .parseFrom(CodedInputStream.newInstance(value.getBytes(), 0, value.getLength()));
                    if (p.getCodeRepositoriesCount() > 0 && p.getCodeRepositories(0).getRevisionsCount() > 0)
                        marks.add(s);
                    sources.put(s, name);
                }
            } catch (Exception e) {
                System.err.println(name);
                e.printStackTrace();
            } finally {
                r.close();
            }
        }

        // Pass 2: copy each record from the file chosen for its key.
        SequenceFile.Writer w = SequenceFile.createWriter(fileSystem, conf,
                new Path(base + "repcache/2015-07/projects.seq"), Text.class, BytesWritable.class);
        try {
            for (int i = 0; i < files.length; i++) {
                FileStatus file = files[i];
                String name = file.getPath().getName();
                if (!isProjectSeqFile(name)) {
                    continue;
                }
                System.out.println("Reading file " + i + " in " + files.length + ": " + name);
                SequenceFile.Reader r = new SequenceFile.Reader(fileSystem, file.getPath(), conf);
                final Text key = new Text();
                final BytesWritable value = new BytesWritable();
                try {
                    while (r.next(key, value)) {
                        String s = key.toString();
                        // A key can be missing from 'sources' when pass 1 gave up on its file early;
                        // comparing from 'name' skips such a record instead of throwing a
                        // NullPointerException that would abandon the rest of the file.
                        if (name.equals(sources.get(s)))
                            w.append(key, value);
                    }
                } catch (Exception e) {
                    System.err.println(name);
                    e.printStackTrace();
                } finally {
                    r.close();
                }
            }
        } finally {
            // Always close the writer so the output is flushed even if a reader could not be opened.
            w.close();
        }
    } finally {
        fileSystem.close();
    }
}

/** Returns whether the file name denotes one of the per-batch project sequence files. */
private static boolean isProjectSeqFile(String name) {
    return name.startsWith("projects-") && name.endsWith(".seq");
}