Example usage for org.apache.commons.io LineIterator next

Introduction

On this page you can find example usages of org.apache.commons.io LineIterator.next().

Prototype

public String next()

Document

Returns the next line in the wrapped Reader.
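
The examples below come from real projects; before them, here is a minimal, self-contained sketch of the typical call pattern. The file name example.txt is a placeholder; next() is normally guarded by hasNext(), and the iterator should be closed so the underlying Reader is released.

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

public class LineIteratorNextExample {
    public static void main(String[] args) throws IOException {
        // FileUtils.lineIterator opens the file and wraps the Reader in a LineIterator.
        LineIterator it = FileUtils.lineIterator(new File("example.txt"), StandardCharsets.UTF_8.name());
        try {
            while (it.hasNext()) {
                // next() returns the next line without its line terminator.
                String line = it.next();
                System.out.println(line);
            }
        } finally {
            // Close the iterator (and with it the wrapped Reader) explicitly.
            LineIterator.closeQuietly(it);
        }
    }
}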

Usage

From source file:com.adobe.acs.tools.csv_asset_importer.impl.CsvAssetImporterServlet.java

/**
 * Adds a populated terminating field to the end of each CSV entry.
 * If the last entry in a CSV row is empty, the CSV library has difficulty recognizing the end of the row.
 *
 * @param is        the CSV file as an input stream
 * @param separator the field separator
 * @param charset   the charset
 * @return an input stream with the same content as is, but with a populated line-termination entry on each line
 * @throws IOException if the stream cannot be read
 */
private InputStream terminateLines(final InputStream is, final char separator, final String charset)
        throws IOException {

    final ByteArrayOutputStream baos = new ByteArrayOutputStream();
    final PrintStream printStream = new PrintStream(baos);

    final LineIterator lineIterator = IOUtils.lineIterator(is, charset);

    while (lineIterator.hasNext()) {
        String line = StringUtils.stripToNull(lineIterator.next());

        if (line != null) {
            line += separator + TERMINATED;
            printStream.println(line);
        }
    }

    return new ByteArrayInputStream(baos.toByteArray());
}
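
One caveat in this example: the PrintStream writes with the platform default charset, while the LineIterator decodes with the supplied charset. A hedged variant that keeps decoding and encoding consistent (assuming charset names a supported encoding) would construct the stream as:

    // UnsupportedEncodingException is a subclass of IOException, so the method's throws clause still covers it.
    final PrintStream printStream = new PrintStream(baos, false, charset);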

From source file:de.tudarmstadt.lt.seg.app.Segmenter.java

private void run_sequential_line() throws Exception {
    ISentenceSplitter sentenceSplitter = newSentenceSplitter();
    ITokenizer tokenizer = newTokenizer();

    InputStream in = System.in;
    if (!"-".equals(_filename_in))
        in = new FileInputStream(_filename_in);
    LineIterator liter = new LineIterator(
            new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())));

    OutputStream out = System.out;
    if (!"-".equals(_filename_out))
        out = new FileOutputStream(_filename_out);
    PrintWriter w = new PrintWriter(new OutputStreamWriter(out, Charset.defaultCharset()));

    for (long lc = 0; liter.hasNext();) {
        if (++lc % 1000 == 0)
            System.err.format("Processing line %d ('%s')%n", lc, _filename_in);
        String l = liter.next().replace("\\t", "\t").replace("\\n", "\n");
        split_and_tokenize(new StringReader(l), String.format("%s:%d", _filename_in, lc), sentenceSplitter,
                tokenizer, _level_filter, _level_normalize, _merge_types, _merge_tokens, _separator_sentence,
                _separator_token, _separator_desc, w);
    }
}

From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java

public void create_vocabulary_index(File vocabulary_file) throws IOException {
    File index_dir = new File(_index_dir, "vocab");
    if (index_dir.exists()) {
        LOG.info("Vocabulary index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();/*from w w w  . ja v a  2s.  com*/
        } else
            return;
    }
    index_dir.mkdirs();
    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(1024.0);
    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_vocab = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(vocabulary_file);
    if (vocabulary_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));
    Document doc = new Document();
    Field f_word = new StringField("word", "", Field.Store.YES);
    doc.add(f_word);
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 10000 == 0)
            LOG.info("Adding {}'th word.", c);
        String line = iter.next();
        try {
            String word = line.trim();
            f_word.setStringValue(word);
            writer_vocab.addDocument(doc);
        } catch (Exception e) {
            LOG.warn("Could not process line '{}' in file '{}', malformed line.", line, vocabulary_file, e);
        }
    }

    writer_vocab.forceMergeDeletes();
    writer_vocab.commit();
    writer_vocab.close();
}

From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv3Reader.java

/**
 * Iterate through the lines and create span annotations accordingly. For a
 * multiple-span annotation, update only the end position of the annotation,
 * based on the position of the annotation in the line.
 */
private void setAnnotations(JCas aJCas, InputStream aIs, String aEncoding) throws IOException {

    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int sentBegin = -1, sentEnd = 0;
    int prevSentEnd = 0;
    StringBuilder sentLineSb = new StringBuilder();
    String lastSent = "";
    while (lineIterator.hasNext()) {
        String line = lineIterator.next();
        if (line.startsWith("#T_")) {
            setLayerAndFeature(aJCas, line);
            continue;
        }

        if (line.startsWith("#Text=")) {
            if (sentLineSb.toString().isEmpty()) {
                sentLineSb.append(line.substring(line.indexOf("=") + 1));
            } else {
                sentLineSb.append(LF + line.substring(line.indexOf("=") + 1));
            }
            lastSent = sentLineSb.toString();
            continue;
        }
        if (line.startsWith("#FORMAT=")) {
            continue;
        }
        if (line.trim().isEmpty()) {
            if (!sentLineSb.toString().isEmpty()) {
                createSentence(aJCas, sentLineSb.toString(), sentBegin, sentEnd, prevSentEnd);
                prevSentEnd = sentEnd;
                sentBegin = -1;// reset for next sentence begin
                sentLineSb = new StringBuilder();
            }

            continue;
        }

        line = line.trim();
        int count = StringUtils.countMatches(line, "\t");

        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }

        String regex = "(?<!\\\\)*" + Pattern.quote(TAB);
        String[] lines = line.split(regex);

        int begin = Integer.parseInt(lines[1].split("-")[0]);
        int end = Integer.parseInt(lines[1].split("-")[1]);
        if (sentBegin == -1) {
            sentBegin = begin;
        }
        sentEnd = end;

        AnnotationUnit unit = createTokens(aJCas, lines, begin, end);

        int ind = 3;

        setAnnosPerTypePerUnit(lines, unit, ind);
    }

    // the last sentence
    if (!lastSent.isEmpty()) {
        createSentence(aJCas, lastSent, sentBegin, sentEnd, prevSentEnd);
    }

    Map<Type, Map<AnnotationUnit, List<AnnotationFS>>> annosPerTypePerUnit = new HashMap<>();
    setAnnosPerUnit(aJCas, annosPerTypePerUnit);
    addAnnotations(aJCas, annosPerTypePerUnit);
    addChainAnnotations(aJCas);
}

From source file:edu.cornell.med.icb.goby.modes.EmpiricalPMode.java

private void scan() throws FileNotFoundException {
    LineIterator iterator = new LineIterator(new FastBufferedReader(new FileReader(inputFilename)));
    int lineNumber = 0;
    ObjectArrayList<String> elementIds = new ObjectArrayList<String>();

    IntArrayList valuesA = new IntArrayList();

    IntArrayList valuesB = new IntArrayList();
    IntArrayList covariatesA = new IntArrayList();
    IntArrayList covariatesB = new IntArrayList();

    counter = new FormatFieldCounter(0, 2, 2, new String[] { "ALL" });
    setupOutput();
    // ignore the header line:
    iterator.next();
    ProgressLogger pg = new ProgressLogger(LOG);
    pg.displayFreeMemory = true;
    pg.itemsName = "pairs";
    pg.expectedUpdates = countLines(inputFilename) - 1;
    pg.start("Starting to scan pairs.");
    while (iterator.hasNext()) {
        String next = iterator.nextLine();
        String[] tokens = next.split("\t");
        boolean pastIds = false;
        boolean pastValues = false;

        String typeOfPairString = tokens[0];
        ObservationWriter.TypeOfPair typeOfPair = ObservationWriter.TypeOfPair.UNDEFINED;

        for (int i = 0; i < tokens.length; i++) {

            try {
                typeOfPair = ObservationWriter.TypeOfPair.valueOf(typeOfPairString);
            } catch (IllegalArgumentException e) {
                System.err.println(
                        "First token of every line should be WITHIN_GROUP_PAIR or BETWEEN_GROUP_PAIR. Found "
                                + typeOfPairString + " on line " + lineNumber);
                System.exit(1);
            }

            elementIds.clear();
            valuesA.clear();
            valuesB.clear();
            covariatesA.clear();
            covariatesB.clear();
            int j;
            String groupComparison = tokens[1];
            elementIds.add(groupComparison);
            for (j = 2; !"VALUES_A".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                elementIds.add(tokens[j]);
            }
            if (j == tokens.length) {
                System.err.println(
                        "Every line must contain the VALUES keyword. Keyword not found on line " + lineNumber);
                System.exit(1);
            }
            j++;
            for (; !"VALUES_B".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                valuesA.add(Integer.parseInt(tokens[j]));
            }
            j++;
            for (; !"COVARIATES_A".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                valuesB.add(Integer.parseInt(tokens[j]));
            }
            if (j == tokens.length) {
                System.err
                        .println("Every line must contain the COVARIATES_A keyword. Keyword not found on line "
                                + lineNumber);
                System.exit(1);
            }
            j++;
            for (; !"COVARIATES_B".equals(tokens[j]); j++) {
                if (j == tokens.length) {
                    break;
                }
                covariatesA.add(Integer.parseInt(tokens[j]));
            }
            if (j == tokens.length) {
                System.err
                        .println("Every line must contain the COVARIATES_B keyword. Keyword not found on line "
                                + lineNumber);
                System.exit(1);
            }
            j++;
            for (; j < tokens.length; j++) {
                covariatesB.add(Integer.parseInt(tokens[j]));
            }

        }
        lineNumber++;
        final String groupComparison = elementIds.get(0);
        process(typeOfPair, groupComparison, elementIds, valuesA, valuesB, covariatesA, covariatesB);
        pg.lightUpdate();
    }
    pg.done(lineNumber);
}
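
A small robustness note on the header-skipping call above: next() throws NoSuchElementException when no line is available, so on an empty input file the unguarded iterator.next() fails. A guarded sketch of the same step:

    // Skip the header line only if one exists; next() would otherwise throw NoSuchElementException.
    if (iterator.hasNext()) {
        iterator.next();
    }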

From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoCustomTsvReader.java

/**
 * Iterate through the lines and create span annotations accordingly. For a multiple-span annotation,
 * update only the end position of the annotation, based on the position of the annotation in the line.
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException {

    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int columns = 1;// token number + token columns (minimum required)
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
    Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();

    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();

    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<Type, Map<Integer, String>>();

    // Store annotations of tokens so that they can be used later for relation annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();

    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<Type, Map<String, List<String>>>();

    // Store tokens indexed by the concatenation of their begin-end offsets so that lemma and POS
    // annotations can be attached later, if they exist
    indexedTokens = new HashMap<String, Token>();

    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.trim().equals("") && sentenceStart == tokenStart) {
            continue;
        }
        if (line.trim().equals("")) {
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            continue;// it is a comment line
        }
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // Sometimes the sentence in #text= contains a new line, which breaks this reader,
        // so skip such lines
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }

        // If we are still unlucky, the line starts with a number from the sentence but not
        // a token number; check whether it is not in the format NUM-NUM
        if (!Character.isDigit(line.split("-")[1].charAt(0))) {
            continue;
        }

        int count = StringUtils.countMatches(line, "\t");

        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }

        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
        }

        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno,
                tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);

        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn + " ");
    }
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }

    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}

From source file:de.tudarmstadt.ukp.clarin.webanno.tsv.WebannoTsv2Reader.java

/**
 * Iterate through the lines and create span annotations accordingly. For a
 * multiple-span annotation, update only the end position of the annotation,
 * based on the position of the annotation in the line.
 */
private void setAnnotations(JCas aJcas, InputStream aIs, String aEncoding, StringBuilder text)
        throws IOException {

    // getting header information
    LineIterator lineIterator = IOUtils.lineIterator(aIs, aEncoding);
    int columns = 1;// token number + token columns (minimum required)
    int tokenStart = 0, sentenceStart = 0;
    Map<Type, Set<Feature>> spanLayers = new LinkedHashMap<Type, Set<Feature>>();
    Map<Type, Type> relationayers = new LinkedHashMap<Type, Type>();

    // an annotation for every feature in a layer
    Map<Type, Map<Integer, AnnotationFS>> annotations = new LinkedHashMap<Type, Map<Integer, AnnotationFS>>();

    // store if this is a Begin/Intermediate/End of an annotation
    Map<Type, Map<Integer, String>> beginEndAnno = new LinkedHashMap<Type, Map<Integer, String>>();

    // Store annotations of tokens so that they can be used later for relation
    // annotations
    Map<Type, Map<String, List<AnnotationFS>>> tokenAnnotations = new LinkedHashMap<Type, Map<String, List<AnnotationFS>>>();

    // store target token ids used for a relation
    Map<Type, Map<String, List<String>>> relationTargets = new LinkedHashMap<Type, Map<String, List<String>>>();

    // Store tokens indexed by the concatenation of their begin-end offsets so that lemma
    // and POS annotations can be attached later, if they exist
    indexedTokens = new HashMap<String, Token>();

    while (lineIterator.hasNext()) {
        String line = lineIterator.next().trim();
        if (line.trim().equals("") && sentenceStart == tokenStart) {
            continue;
        }
        if (line.trim().equals("")) {
            text.replace(tokenStart - 1, tokenStart, "");
            tokenStart = tokenStart - 1;
            Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
            sentence.addToIndexes();
            tokenStart++;
            sentenceStart = tokenStart;
            text.append("\n");
            continue;
        }
        // sentence
        if (line.startsWith("#text=")) {
            continue;
        }
        if (line.startsWith("#id=")) {
            continue;// it is a comment line
        }
        if (line.startsWith("#")) {
            columns = getLayerAndFeature(aJcas, columns, spanLayers, relationayers, line);
            continue;
        }
        // Sometimes the sentence in #text= contains a new line, which breaks
        // this reader, so skip such lines
        if (!Character.isDigit(line.split(" ")[0].charAt(0))) {
            continue;
        }

        // If we are still unlucky, the line starts with a number from the
        // sentence but not a token number; check whether it is not in the
        // format NUM-NUM
        if (!Character.isDigit(line.split("-")[1].charAt(0))) {
            continue;
        }

        int count = StringUtils.countMatches(line, "\t");

        if (columns != count) {
            throw new IOException(fileName + " This is not a valid TSV File. check this line: " + line);
        }

        // adding tokens and sentence
        StringTokenizer lineTk = new StringTokenizer(line, "\t");
        String tokenNumberColumn = lineTk.nextToken();
        String tokenColumn = lineTk.nextToken();
        Token token = new Token(aJcas, tokenStart, tokenStart + tokenColumn.length());
        token.addToIndexes();
        Type posType = JCasUtil.getType(aJcas, POS.class);
        Type lemmaType = JCasUtil.getType(aJcas, Lemma.class);
        if (spanLayers.containsKey(posType) || spanLayers.containsKey(lemmaType)) {
            indexedTokens.put(tokenStart + "-" + tokenStart + tokenColumn.length(), token);
        }

        // adding the annotations
        createSpanAnnotation(aJcas, tokenStart, spanLayers, relationayers, annotations, beginEndAnno,
                tokenAnnotations, relationTargets, lineTk, tokenColumn, tokenNumberColumn);

        tokenStart = tokenStart + tokenColumn.length() + 1;
        text.append(tokenColumn + " ");
    }
    if (tokenStart > sentenceStart) {
        Sentence sentence = new Sentence(aJcas, sentenceStart, tokenStart);
        sentence.addToIndexes();
        text.append("\n");
    }

    createRelationLayer(aJcas, relationayers, tokenAnnotations, relationTargets);
}

From source file:de.tudarmstadt.ukp.clarin.webanno.api.dao.RepositoryServiceDbData.java

/**
 * Check whether a TAB-separated training file is in the correct format before importing.
 */
private boolean isTabSepFileFormatCorrect(File aFile) {
    try {
        LineIterator it = new LineIterator(new FileReader(aFile));
        while (it.hasNext()) {
            String line = it.next();
            if (line.trim().length() == 0) {
                continue;
            }
            if (line.split("\t").length != 2) {
                return false;
            }
        }
    } catch (Exception e) {
        return false;
    }
    return true;
}
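
Note that the validator above never closes its LineIterator, so the underlying FileReader can leak when the file turns out to be valid. A sketch of the same check with explicit cleanup (logic otherwise unchanged) might look like this:

private boolean isTabSepFileFormatCorrect(File aFile) {
    LineIterator it = null;
    try {
        it = new LineIterator(new FileReader(aFile));
        while (it.hasNext()) {
            String line = it.next();
            if (line.trim().length() == 0) {
                continue;
            }
            if (line.split("\t").length != 2) {
                return false;
            }
        }
    } catch (Exception e) {
        return false;
    } finally {
        // Also closes the wrapped FileReader; closeQuietly tolerates null.
        LineIterator.closeQuietly(it);
    }
    return true;
}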

From source file:dk.netarkivet.archive.arcrepositoryadmin.ReplicaCacheDatabase.java

/**
 * Method for adding the results from a list of filenames on a replica. This list of filenames should contain
 * all the files known to the database.
 * <p>/*from w  w  w . ja v a  2s .  c  om*/
 * For each file in the FileListJob the following fields are set for the corresponding entry in the replicafileinfo
 * table: <br/>
 * - filelist_status = ok. <br/>
 * - filelist_checkdatetime = now.
 * <p>
 * For each entry in the replicafileinfo table for the replica that is missing from the results of the FileListJob,
 * the following fields are assigned the following values: <br/>
 * - filelist_status = missing. <br/>
 * - filelist_checkdatetime = now.
 *
 * @param filelistFile The list of filenames either parsed from a FilelistJob or the result from a
 * GetAllFilenamesMessage.
 * @param replica The replica, which the FilelistBatchjob has run upon.
 * @throws ArgumentNotValid If the filelist or the replica is null.
 * @throws UnknownID If the replica does not already exist in the database.
 */
@Override
public void addFileListInformation(File filelistFile, Replica replica) throws ArgumentNotValid, UnknownID {
    ArgumentNotValid.checkNotNull(filelistFile, "File filelistFile");
    ArgumentNotValid.checkNotNull(replica, "Replica replica");

    // Sort the filelist file.
    File sortedResult = new File(filelistFile.getParent(), filelistFile.getName() + ".sorted");
    FileUtils.sortFile(filelistFile, sortedResult);
    final long datasize = FileUtils.countLines(sortedResult);

    Connection con = ArchiveDBConnection.get();
    Set<Long> missingReplicaRFIs = null;
    LineIterator lineIterator = null;
    try {
        // Make sure, that the replica exists in the database.
        if (!ReplicaCacheHelpers.existsReplicaInDB(replica, con)) {
            String errorMsg = "Cannot add filelist information, since the replica '" + replica.toString()
                    + "' does not exist in the database.";
            log.warn(errorMsg);
            throw new UnknownID(errorMsg);
        }

        log.info("Starting processing of {} filelist entries for replica {}", datasize, replica.getId());

        // retrieve the list of files already known by this cache.
        // TODO This does not scale! Should this data structure
        // (missingReplicaRFIs) be disk-bound in some way?
        missingReplicaRFIs = ReplicaCacheHelpers.retrieveReplicaFileInfoGuidsForReplica(replica.getId(), con);

        // Initialize String iterator
        lineIterator = new LineIterator(new FileReader(sortedResult));

        String lastFileName = "";
        int i = 0;
        while (lineIterator.hasNext()) {
            String file = lineIterator.next();
            // log that it is in progress every so often.
            if ((i % LOGGING_ENTRY_INTERVAL) == 0) {
                log.info("Processed file list entry number {} for replica {}", i, replica);
                // Close connection, and open another one
                // to avoid memory-leak (NAS-2003)
                ArchiveDBConnection.release(con);
                con = ArchiveDBConnection.get();
                log.debug("Databaseconnection has now been renewed");
            }
            ++i;

            // handle duplicates.
            if (file.equals(lastFileName)) {
                log.warn("There have been found multiple files with the name '{}'", file);
                continue;
            }

            lastFileName = file;
            // Add information for one file, and remove the ReplicaRFI from the
            // set of missing ones.
            missingReplicaRFIs.remove(ReplicaCacheHelpers.addFileInformation(file, replica, con));
        }
    } catch (IOException e) {
        throw new IOFailure("Unable to read the filenames from file", e);
    } finally {
        ArchiveDBConnection.release(con);
        LineIterator.closeQuietly(lineIterator);
    }

    con = ArchiveDBConnection.get();
    try {
        // go through the not found replicafileinfo for this replica to change
        // their filelist_status to missing.
        if (missingReplicaRFIs.size() > 0) {
            log.warn("Found {} missing files for replica '{}'.", missingReplicaRFIs.size(), replica);
            for (long rfi : missingReplicaRFIs) {
                // set the replicafileinfo in the database to missing.
                ReplicaCacheHelpers.updateReplicaFileInfoMissingFromFilelist(rfi, con);
            }
        }
        // Update the date for filelist update for this replica.
        ReplicaCacheHelpers.updateFilelistDateForReplica(replica, con);
    } finally {
        ArchiveDBConnection.release(con);
    }
}

From source file:de.tudarmstadt.lt.lm.app.GenerateNgramIndex.java

public void create_ngram_index(File ngram_joined_counts_file) throws IOException {
    File index_dir = new File(_index_dir, "ngram");
    if (index_dir.exists()) {
        LOG.info("Ngram index already exists in directory '{}'.", index_dir.getAbsolutePath());
        if (_overwrite) {
            LOG.info("Overwriting index '{}',", index_dir);
            index_dir.delete();// ww w.j a  v  a2  s . c  om
        } else
            return;
    }
    index_dir.mkdirs();

    Analyzer analyzer = new KeywordAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    // use 80 percent of the available total memory
    double total_mem_mb = (double) Runtime.getRuntime().maxMemory() / 1e6;
    double percentage_ram_buffer = Properties.ramBufferPercentage();
    if (percentage_ram_buffer > 0) {
        double percentage_ram_buffer_mb = total_mem_mb * percentage_ram_buffer;
        LOG.info(String.format("Setting ram buffer size to %.2f MB (%.2f%% from %.2f MB)",
                percentage_ram_buffer_mb, percentage_ram_buffer * 100, total_mem_mb));
        iwc.setRAMBufferSizeMB(percentage_ram_buffer_mb);
    }

    Directory directory = new MMapDirectory(index_dir);
    IndexWriter writer_ngram = new IndexWriter(directory, iwc);

    InputStream in = new FileInputStream(ngram_joined_counts_file);
    if (ngram_joined_counts_file.getName().endsWith(".gz"))
        in = new GZIPInputStream(in);
    LineIterator iter = new LineIterator(new BufferedReader(new InputStreamReader(in, "UTF-8")));

    Document doc = new Document();
    Field f_ngram = new StringField("ngram", "", Store.YES);
    doc.add(f_ngram);
    Field f_n = new IntField("cardinality", 0, Store.YES);
    doc.add(f_n);
    Field f_word = new StringField("word", "", Store.YES);
    doc.add(f_word);
    Field f_hist = new StringField("history", "", Store.YES);
    doc.add(f_hist);
    Field f_lower = new StringField("lower", "", Store.YES);
    doc.add(f_lower);
    Field f_count = new StoredField("num", 0L);
    doc.add(f_count);

    Field[] f_follow = new Field[4];
    f_follow[0] = new StoredField("nf_s", 0L);
    doc.add(f_follow[0]);
    f_follow[1] = new StoredField("nf_N1", 0L);
    doc.add(f_follow[1]);
    f_follow[2] = new StoredField("nf_N2", 0L);
    doc.add(f_follow[2]);
    f_follow[3] = new StoredField("nf_N3", 0L);
    doc.add(f_follow[3]);
    Field[] f_precede = new Field[4];
    f_precede[0] = new StoredField("np_s", 0L);
    doc.add(f_precede[0]);
    f_precede[1] = new StoredField("np_N1", 0L);
    doc.add(f_precede[1]);
    f_precede[2] = new StoredField("np_N2", 0L);
    doc.add(f_precede[2]);
    f_precede[3] = new StoredField("np_N3", 0L);
    doc.add(f_precede[3]);
    Field[] f_followerprecede = new Field[4];
    f_followerprecede[0] = new StoredField("nfp_s", 0L);
    doc.add(f_followerprecede[0]);
    f_followerprecede[1] = new StoredField("nfp_N1", 0L);
    doc.add(f_followerprecede[1]);
    f_followerprecede[2] = new StoredField("nfp_N2", 0L);
    doc.add(f_followerprecede[2]);
    f_followerprecede[3] = new StoredField("nfp_N3", 0L);
    doc.add(f_followerprecede[3]);

    Long[][] N = new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } };
    Long[] S = new Long[] { 0L };
    long c = 0;
    while (iter.hasNext()) {
        if (++c % 100000 == 0)
            LOG.info("Adding {}'th ngram.", c);
        String line = iter.next();
        try {
            String[] splits = de.tudarmstadt.lt.utilities.StringUtils.rtrim(line).split("\t");
            String ngram_str = splits[0];
            if (de.tudarmstadt.lt.utilities.StringUtils.trim(ngram_str).isEmpty()) {
                LOG.warn("Ngram is empty, skipping line {}: '{}' (file '{}').", c, line,
                        ngram_joined_counts_file);
                continue;
            }

            List<String> ngram = Arrays.asList(ngram_str.split(" "));
            long num = Long.parseLong(splits[1]);
            int n = ngram.size();

            f_ngram.setStringValue(ngram_str);
            f_n.setIntValue(n);
            f_word.setStringValue(ngram.get(ngram.size() - 1));
            f_hist.setStringValue(StringUtils.join(ngram.subList(0, ngram.size() - 1), " "));
            f_lower.setStringValue(StringUtils.join(ngram.subList(1, ngram.size()), " "));
            f_count.setLongValue(num);

            for (int j = 0; j < f_follow.length; j++) {
                f_follow[j].setLongValue(0L);
                f_precede[j].setLongValue(0L);
                f_followerprecede[j].setLongValue(0L);
            }

            if (splits.length > 2 && !splits[2].isEmpty()) {
                // precede or follow or followerprecede
                String[] splits_ = splits[2].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 3 && !splits[3].isEmpty()) {
                // should be follow or followerprecede
                String[] splits_ = splits[3].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }
            if (splits.length > 4 && !splits[4].isEmpty()) {
                // should be followerprecede
                String[] splits_ = splits[4].split(":");
                String type = splits_[0];
                String[] count_values = splits_[1].split(",");
                if (count_values.length > 0) {
                    if ("n_f".equals(type))
                        f_follow[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_p".equals(type))
                        f_precede[0].setLongValue(Long.parseLong(count_values[0]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[0].setLongValue(Long.parseLong(count_values[0]));
                }
                for (int i = 1; i < count_values.length; i++) {
                    if ("n_f".equals(type))
                        f_follow[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_p".equals(type))
                        f_precede[i].setLongValue(Long.parseLong(count_values[i]));
                    else if ("n_fp".equals(type))
                        f_followerprecede[i].setLongValue(Long.parseLong(count_values[i]));
                }
            }

            writer_ngram.addDocument(doc);

            while (N.length <= n) {
                N = ArrayUtils.getConcatinatedArray(N, new Long[][] { { 0L, 0L, 0L, 0L, 0L, 0L } });
                S = ArrayUtils.getConcatinatedArray(S, new Long[] { 0L });
            }

            if (num == 1L)
                N[n][1]++;
            else if (num == 2L)
                N[n][2]++;
            else if (num == 3L)
                N[n][3]++;
            else if (num == 4L)
                N[n][4]++;
            else
                N[n][5]++;
            N[n][0]++;
            S[n] += num;

        } catch (Exception e) {
            LOG.error("Could not process line '{}' in file '{}:{}', malformed line.", line,
                    ngram_joined_counts_file, c, e);
        }
    }

    writer_ngram.forceMergeDeletes();
    writer_ngram.commit();
    writer_ngram.close();

    StringBuilder b = new StringBuilder(String.format(
            "#%n# Number of times where an ngram occurred: %n#  at_least_once, exactly_once, exactly_twice, exactly_three_times, exactly_four_times, five_times_or_more.%n#%nmax_n=%d%nmax_c=6%n",
            N.length - 1));
    for (int n = 1; n < N.length; n++)
        b.append(String.format("n%d=%s%n", n, StringUtils.join(N[n], ',')));
    for (int n = 1; n < S.length; n++)
        b.append(String.format("s%d=%d%n", n, S[n]));
    FileUtils.writeStringToFile(new File(_index_dir, "__sum_ngrams__"), b.toString());

}