de.tudarmstadt.lt.seg.app.Segmenter.java Source code

Introduction

Here is the source code for de.tudarmstadt.lt.seg.app.Segmenter.java, a command-line tool that splits documents into sentences and tokenizes them, reading from a file or stdin and writing to a file or stdout.
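
A typical invocation might look like the following sketch. The options are the ones defined in the source below; the jar name lt.seg.jar is an assumption:

    java -Dfile.encoding=UTF-8 -Duser.language=en -Duser.country=US \
         -cp lt.seg.jar de.tudarmstadt.lt.seg.app.Segmenter \
         -f input.txt -o output.txt -s RuleSplitter -t RuleTokenizer \
         --filter 2 --normalize 0 -l --parallel 4

Piped input works the same way, e.g. cat corpus.txt | java ... Segmenter -l (the tool reads stdin when -f is '-' or omitted).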

Source

/*
 *   Copyright 2014
 *
 *   Licensed under the Apache License, Version 2.0 (the "License");
 *   you may not use this file except in compliance with the License.
 *   You may obtain a copy of the License at
 *
 *       http://www.apache.org/licenses/LICENSE-2.0
 *
 *   Unless required by applicable law or agreed to in writing, software
 *   distributed under the License is distributed on an "AS IS" BASIS,
 *   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *   See the License for the specific language governing permissions and
 *   limitations under the License.
 */
package de.tudarmstadt.lt.seg.app;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.StringReader;
import java.nio.charset.Charset;
import java.util.Spliterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.io.LineIterator;

import de.tudarmstadt.lt.seg.SegmentType;
import de.tudarmstadt.lt.seg.sentence.ISentenceSplitter;
import de.tudarmstadt.lt.seg.sentence.RuleSplitter;
import de.tudarmstadt.lt.seg.token.ITokenizer;
import de.tudarmstadt.lt.seg.token.RuleTokenizer;
import de.tudarmstadt.lt.utilities.cli.CliUtils;
import de.tudarmstadt.lt.utilities.cli.ExtendedGnuParser;

/**
 * @author Steffen Remus
 *
 */
@SuppressWarnings("static-access")
public class Segmenter implements Runnable {

    private static String USAGE_HEADER = String.format(
            "+++%nSplit sentences and tokenize documents. Supports piped input. %nUses default encoding and locale. Specify '-Dfile.encoding' for changing default encoding, specify '-Duser.language', '-Duser.country', '-Duser.script', '-Duser.variant' for changing default locale. E.g. '-Dfile.encoding=UTF-8 -Duser.language=en -Duser.country=US'! %nSupported RuleSets for RuleSplitter: %s %nSupported RuleSets for RuleTokenizer: %s%n+++%nOptions:",
            de.tudarmstadt.lt.seg.sentence.rules.RuleSet.getAvailable(),
            de.tudarmstadt.lt.seg.token.rules.RuleSet.getAvailable());

    private static boolean DEBUG = false;

    public static void main(String[] args) throws ClassNotFoundException {
        new Segmenter(args).run();
    }

    /**
     * Default constructor.
     * 
     * Set the necessary fields via an anonymous-subclass initializer:
     * <code>new Segmenter(){{ _variable = value; ... }}</code> (see the example below).
     */
    public Segmenter() {
        /* NOTHING TO DO */
    }
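
    // A minimal sketch of that initializer pattern (hypothetical values; the
    // no-arg constructor sets no defaults, so every field that run() touches
    // must be assigned explicitly):
    //
    //     Segmenter s = new Segmenter() {{
    //         _sentence_splitter_type = "RuleSplitter";
    //         _tokenizer_type = "RuleTokenizer";
    //         _ruleset_sentence = "default";
    //         _ruleset_token = "default";
    //         _boundary_as_part_of_sentence = true;
    //         _filename_in = "input.txt";
    //         _filename_out = "-";
    //         _separator_sentence = "\n";
    //         _separator_token = " ";
    //         _separator_desc = "\t";
    //         _level_filter = 2;
    //         _level_normalize = 0;
    //         _parallelism = 1;
    //     }};
    //     s.run();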

    static Options opts;

    static {
        opts = new Options();
        opts.addOption(new Option("?", "help", false, "display this message"));

        opts.addOption(OptionBuilder.withLongOpt("file").withArgName("filename").hasArg()
                .withDescription(
                        "Specify the file you want to read from. Specify '-' to read from stdin. (default: '-').")
                .create("f"));
        opts.addOption(OptionBuilder.withLongOpt("out").withArgName("filename").hasArg()
                .withDescription(
                        "Specify the file you want to write to. Specify '-' to write to stdout. (default: '-').")
                .create("o"));
        opts.addOption(OptionBuilder.withLongOpt("sentence-separator").withArgName("separator").hasArg()
                .withDescription("Specify the separator for sentences. (default: '\\n').").create("seps"));
        opts.addOption(OptionBuilder.withLongOpt("token-separator").withArgName("separator").hasArg()
                .withDescription("Specify the separator for tokens. (default: ' ').").create("sept"));
        opts.addOption(OptionBuilder.withLongOpt("source-separator").withArgName("separator").hasArg()
                .withDescription("Specify the separator for the source description. (default: '\\t').")
                .create("sepd"));
        opts.addOption(OptionBuilder.withLongOpt("sentencesplitter").withArgName("class").hasArg().withDescription(
                "Specify the class of the sentence splitter that you want to use: {BreakSplitter, LineSplitter, RuleSplitter, NullSplitter} (default: RuleSplitter)")
                .create("s"));
        opts.addOption(OptionBuilder.withLongOpt("tokenizer").withArgName("class").hasArg().withDescription(
                "Specify the class of the word tokinzer that you want to use: {BreakTokenizer, DiffTokenizer, EmptySpaceTokenizer, NullTokenizer} (default: DiffTokenizer)")
                .create("t"));
        opts.addOption(OptionBuilder.withLongOpt("parallel").withArgName("num").hasArg().withDescription(
                "Specify the number of parallel threads. (Note: output might be genereated in a different order than provided by input, specify 1 if you need to keep the order. Parallel mode requires one document per line [ -l ] (default: 1).")
                .create());
        opts.addOption(OptionBuilder.withLongOpt("normalize")
                .withDescription("Specify the degree of token normalization [0...4] (default: 0).").hasArg()
                .withArgName("level").create("nl"));
        opts.addOption(OptionBuilder.withLongOpt("filter")
                .withDescription("Specify the degree of token filtering [0...5] (default: 2).").hasArg()
                .withArgName("level").create("fl"));
        opts.addOption(OptionBuilder.withLongOpt("merge")
                .withDescription("Specify the degree of merging conscutive items {0,1,2} (default: 0).")
                .hasOptionalArg().withArgName("level").create("ml"));
        opts.addOption(OptionBuilder.withLongOpt("onedocperline").withDescription(
                "Specify if you want to process documents linewise and preserve document ids, i.e. map line numbers to sentences.")
                .create("l"));
        opts.addOption(OptionBuilder.withLongOpt("sentence-ruleset").withArgName("languagecode").hasArg()
                .withDescription(String.format(
                        "Specify the ruleset that you want to use together with RuleSplitter (avaliable: %s) (default: 'default')",
                        de.tudarmstadt.lt.seg.sentence.rules.RuleSet.getAvailable()))
                .create());
        opts.addOption(OptionBuilder.withLongOpt("token-ruleset").withArgName("languagecode").hasArg()
                .withDescription(String.format(
                        "Specify the ruleset that you want to use together with RuleTokenizer (avaliable: %s) (default: 'default')",
                        de.tudarmstadt.lt.seg.token.rules.RuleSet.getAvailable()))
                .create());
        opts.addOption(OptionBuilder.withLongOpt("boundary-as-part-of-sentence")
                .withDescription(
                        "Specify if sentence boundaries should be part of the sentence segment (default: true).")
                .hasArg().withArgName("true|false").create("bps"));
        opts.addOption(OptionBuilder.withLongOpt("debug").withDescription("Enable debugging.").create());
    }

    public Segmenter(String[] args) {
        try {
            CommandLine cmd = new ExtendedGnuParser(true).parse(opts, args);
            if (cmd.hasOption("help"))
                CliUtils.print_usage_quit(System.err, Segmenter.class.getSimpleName(), opts, USAGE_HEADER, null, 0);

            _sentence_splitter_type = cmd.getOptionValue("sentencesplitter", RuleSplitter.class.getSimpleName());
            _tokenizer_type = cmd.getOptionValue("tokenizer", RuleTokenizer.class.getSimpleName());
            _filename_in = cmd.getOptionValue("file", "-");
            _filename_out = cmd.getOptionValue("out", "-");
            _separator_sentence = cmd.getOptionValue("seps", "\n");
            _separator_token = cmd.getOptionValue("sept", " ");
            _separator_desc = cmd.getOptionValue("sepd", "\t");

            _level_normalize = Integer.parseInt(cmd.getOptionValue("normalize", "0"));
            _level_filter = Integer.parseInt(cmd.getOptionValue("filter", "2"));

            int level_merge = cmd.hasOption("merge") ? 1 : 0;
            if (cmd.hasOption("merge") && cmd.getOptionValue("merge") != null)
                level_merge = Integer.parseInt(cmd.getOptionValue("merge", "1"));
            _merge_tokens = level_merge >= 2;
            _merge_types = level_merge >= 1;

            _parallelism = Integer.parseInt(cmd.getOptionValue("parallel", "1"));//Runtime.getRuntime().availableProcessors()
            _one_doc_per_line = cmd.hasOption("l");
            _ruleset_sentence = cmd.getOptionValue("sentence-ruleset");
            _ruleset_token = cmd.getOptionValue("token-ruleset");
            _boundary_as_part_of_sentence = Boolean
                    .parseBoolean(cmd.getOptionValue("boundary-as-part-of-sentence", "true"));

            DEBUG = cmd.hasOption("debug");
            if (DEBUG) {
                _separator_sentence = "\n";
                _separator_token = "\n";
                _separator_desc = "\n";
            }

        } catch (Exception e) {
            CliUtils.print_usage_quit(System.err, Segmenter.class.getSimpleName(), opts, USAGE_HEADER,
                    String.format("%s: %s%n", e.getClass().getSimpleName(), e.getMessage()), 1);
        }

    }

    int _level_normalize;
    int _level_filter;
    int _parallelism;
    String _filename_in;
    String _filename_out;
    String _tokenizer_type;
    String _sentence_splitter_type;
    String _separator_sentence;
    String _separator_token;
    String _separator_desc;
    String _ruleset_sentence;
    String _ruleset_token;
    boolean _one_doc_per_line;
    boolean _merge_types;
    boolean _merge_tokens;
    boolean _boundary_as_part_of_sentence;

    /* (non-Javadoc)
     * @see java.lang.Runnable#run()
     */
    @Override
    public void run() {
        System.err.println("Setting parallelism to " + _parallelism);
        System.err.format("Using '%s' and '%s'.%n", _tokenizer_type, _sentence_splitter_type);
        try {
            if (_parallelism > 1 && _one_doc_per_line)
                run_parallel();
            else if (_parallelism <= 1 && _one_doc_per_line)
                run_sequential_line();
            else
                run_sequential_stream();
        } catch (Exception e) {
            CliUtils.print_usage_quit(Segmenter.class.getSimpleName(), null,
                    String.format("%s: %s%n", e.getClass().getSimpleName(), e.getMessage()), 1);
        }

    }

    private void run_sequential_stream() throws Exception {
        ISentenceSplitter sentenceSplitter = newSentenceSplitter();
        ITokenizer tokenizer = newTokenizer();

        InputStream in = System.in;
        if (!"-".equals(_filename_in))
            in = new FileInputStream(_filename_in);
        BufferedReader r = new BufferedReader(new InputStreamReader(in, Charset.defaultCharset()));

        OutputStream out = System.out;
        if (!"-".equals(_filename_out))
            out = new FileOutputStream(_filename_out);
        PrintWriter w = new PrintWriter(new OutputStreamWriter(out, Charset.defaultCharset()));

        split_and_tokenize(r, _filename_in, sentenceSplitter, tokenizer, _level_filter, _level_normalize,
                _merge_types, _merge_tokens, _separator_sentence, _separator_token, _separator_desc, w);

        r.close();
    }

    private void run_sequential_line() throws Exception {
        ISentenceSplitter sentenceSplitter = newSentenceSplitter();
        ITokenizer tokenizer = newTokenizer();

        InputStream in = System.in;
        if (!"-".equals(_filename_in))
            in = new FileInputStream(_filename_in);
        LineIterator liter = new LineIterator(
                new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())));

        OutputStream out = System.out;
        if (!"-".equals(_filename_out))
            out = new FileOutputStream(_filename_out);
        PrintWriter w = new PrintWriter(new OutputStreamWriter(out, Charset.defaultCharset()));

        for (long lc = 0; liter.hasNext();) {
            if (++lc % 1000 == 0)
                System.err.format("Processing line %d ('%s')%n", lc, _filename_in);
            String l = liter.next().replace("\\t", "\t").replace("\\n", "\n");
            split_and_tokenize(new StringReader(l), String.format("%s:%d", _filename_in, lc), sentenceSplitter,
                    tokenizer, _level_filter, _level_normalize, _merge_types, _merge_tokens, _separator_sentence,
                    _separator_token, _separator_desc, w);
        }
    }

    private void run_parallel() throws Exception {

        InputStream in = System.in;
        if (!"-".equals(_filename_in))
            in = new FileInputStream(_filename_in);
        Stream<String> liter = new BufferedReader(new InputStreamReader(in, Charset.defaultCharset())).lines();

        ThreadLocal<ISentenceSplitter> sentenceSplitter = ThreadLocal.withInitial(() -> {
            try {
                return newSentenceSplitter();
            } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
                throw new RuntimeException(e);
            }
        });
        ThreadLocal<ITokenizer> tokenizer = ThreadLocal.withInitial(() -> {
            try {
                return newTokenizer();
            } catch (ClassNotFoundException | InstantiationException | IllegalAccessException e) {
                throw new RuntimeException(e);
            }
        });

        final PrintWriter[] w = new PrintWriter[_parallelism];
        // init writers: one per slot; split_and_tokenize later picks slot
        // docid % _parallelism, and when writing to files slot i goes to
        // "<filename_out>_<i>" so output is sharded across files
        for (int i = 0; i < _parallelism; i++) {
            OutputStream out = System.out;
            if (!"-".equals(_filename_out)) {
                out = new FileOutputStream(String.format("%s_%d", _filename_out, i));
            }
            w[i] = new PrintWriter(new OutputStreamWriter(out, Charset.defaultCharset()));
        }

        BlockingQueue<Runnable> queue = new ArrayBlockingQueue<Runnable>(_parallelism * 2, true);
        ExecutorService es = new ThreadPoolExecutor(_parallelism, _parallelism, 0L, TimeUnit.MILLISECONDS, queue);

        AtomicLong lc = new AtomicLong(0);
        liter.forEach((line) -> {
            // don't submit new tasks while the queue is full; wait until it has
            // some capacity again
            while (queue.remainingCapacity() == 0)
                try {
                    Thread.sleep(10);
                } catch (InterruptedException e) {
                    /* ignore and retry */
                }
            es.submit(() -> {
                final long docid = lc.incrementAndGet();
                if (docid % 1000 == 0)
                    System.err.format("Processing line %d ('%s')%n", docid, _filename_in);
                final int w_i = (int) (docid % _parallelism);
                split_and_tokenize(new StringReader(line.trim()), String.format("%s:%d", _filename_in, docid),
                        sentenceSplitter.get(), tokenizer.get(), _level_filter, _level_normalize, _merge_types,
                        _merge_tokens, _separator_sentence, _separator_token, _separator_desc, w[w_i]);

            });
        });
        es.shutdown();
        es.awaitTermination(Integer.MAX_VALUE, TimeUnit.DAYS);

        // TODO: the stream parallelism version does not work because it submits too many threads at once
        //      AtomicLong lc = new AtomicLong(0);
        //      ForkJoinPool forkJoinPool = new ForkJoinPool(_parallelism);
        //      forkJoinPool.submit(() -> 
        //         liter.parallel().forEach((line) -> {
        //            final long docid = lc.incrementAndGet();
        //            if(docid % 1000 == 0)
        //               System.err.format("Processing line %d ('%s')%n", docid, _filename_in);
        //   
        //            String l = line.replace("\\t", "\t").replace("\\n", "\n");
        //            split_and_tokenize(
        //                  new StringReader(l),
        //                  String.format("%s:%d", _filename_in, docid),
        //                  sentenceSplitter.get(), 
        //                  tokenizer.get(), 
        //                  _level_filter,
        //                  _level_normalize,
        //                  _merge_types,
        //                  _merge_tokens,
        //                  _separator_sentence,
        //                  _separator_token,
        //                  _separator_desc,
        //                  w);
        //      })).get();

    }

    public static void split_and_tokenize(Reader reader, String docid, ISentenceSplitter sentenceSplitter,
            ITokenizer tokenizer, int level_filter, int level_normalize, boolean merge_types, boolean merge_tokens,
            String separator_sentence, String separator_token, String separator_desc, PrintWriter writer) {
        try {
            final StringBuffer buf = new StringBuffer(); // used to check whether the token stream is empty; take care when running in parallel rather than sequentially!
            sentenceSplitter.init(reader).stream().sequential().forEach(sentence_segment -> {
                if (DEBUG) {
                    writer.format("%s%s", docid, separator_desc);
                    writer.println(sentence_segment.toString());
                    writer.print(separator_sentence);
                }
                if (sentence_segment.type != SegmentType.SENTENCE)
                    return;
                tokenizer.init(sentence_segment.asString());
                Stream<String> tokens = null;
                if (DEBUG)
                    tokens = tokenizer.stream().map(x -> x.toString() + separator_token);
                else
                    tokens = StreamSupport.stream(tokenizer
                            .filteredAndNormalizedTokens(level_filter, level_normalize, merge_types, merge_tokens)
                            .spliterator(), false).map(x -> x + separator_token);
                // Peek at the first token through the spliterator: tryAdvance
                // consumes it into buf, and the stream rebuilt from the same
                // spliterator continues with the remaining tokens, so empty
                // sentences produce no output line at all.
                Spliterator<String> spliterator = tokens.spliterator();
                tokens = StreamSupport.stream(spliterator, false);
                buf.setLength(0);
                boolean empty = !spliterator.tryAdvance(buf::append);
                if (empty)
                    return;
                synchronized (writer) {
                    // writer.write(Thread.currentThread().getId() + "\t");
                    writer.format("%s%s", docid, separator_desc);
                    writer.print(buf);
                    tokens.forEach(writer::print);
                    writer.print(separator_sentence);
                    writer.flush();
                }
            });
        } catch (Exception e) {
            // print the exception and its full cause chain
            Throwable t = e;
            while (t != null) {
                System.err.format("%s: %s%n", t.getClass(), t.getMessage());
                t = t.getCause();
            }
        }
    }

    public ITokenizer newTokenizer() throws ClassNotFoundException, InstantiationException, IllegalAccessException {
        String packageName = ITokenizer.class.getPackage().getName();
        @SuppressWarnings("unchecked")
        Class<ITokenizer> clazz = (Class<ITokenizer>) Class
                .forName(String.format("%s.%s", packageName, _tokenizer_type));
        ITokenizer instance = clazz.newInstance();
        if (RuleTokenizer.class.getSimpleName().equals(_tokenizer_type)) {
            de.tudarmstadt.lt.seg.token.rules.RuleSet rs = de.tudarmstadt.lt.seg.token.rules.RuleSet
                    .get(_ruleset_token);
            ((RuleTokenizer) instance).initParam(rs);
        }
        return instance;
    }

    public ISentenceSplitter newSentenceSplitter()
            throws ClassNotFoundException, InstantiationException, IllegalAccessException {
        String packageName = ISentenceSplitter.class.getPackage().getName();
        @SuppressWarnings("unchecked")
        Class<ISentenceSplitter> clazz = (Class<ISentenceSplitter>) Class
                .forName(String.format("%s.%s", packageName, _sentence_splitter_type));
        ISentenceSplitter instance = clazz.newInstance();
        if (RuleSplitter.class.getSimpleName().equals(_sentence_splitter_type)) {
            de.tudarmstadt.lt.seg.sentence.rules.RuleSet rs = de.tudarmstadt.lt.seg.sentence.rules.RuleSet
                    .get(_ruleset_sentence);
            ((RuleSplitter) instance).initParam(rs, _boundary_as_part_of_sentence);
        }
        return instance;
    }

}
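
Usage

Since split_and_tokenize is public and static, the class can also be used as a library. Below is a minimal sketch (SegmenterExample is a hypothetical class; it sits in the same package because Segmenter's fields are package-private, and it assumes the 'default' rule sets named in the usage header are available). The filter, normalization, merge and separator arguments mirror the command-line defaults above:

package de.tudarmstadt.lt.seg.app;

import java.io.PrintWriter;
import java.io.StringReader;

import de.tudarmstadt.lt.seg.sentence.ISentenceSplitter;
import de.tudarmstadt.lt.seg.token.ITokenizer;

public class SegmenterExample {
    public static void main(String[] args) throws Exception {
        // build splitter and tokenizer through Segmenter's own factories,
        // using the initializer pattern from the default constructor's Javadoc
        Segmenter seg = new Segmenter() {{
            _sentence_splitter_type = "RuleSplitter";
            _tokenizer_type = "RuleTokenizer";
            _ruleset_sentence = "default";
            _ruleset_token = "default";
            _boundary_as_part_of_sentence = true;
        }};
        ISentenceSplitter splitter = seg.newSentenceSplitter();
        ITokenizer tokenizer = seg.newTokenizer();

        PrintWriter w = new PrintWriter(System.out);
        Segmenter.split_and_tokenize(
                new StringReader("First sentence. Second sentence."),
                "example:1",     // document id, written before each sentence
                splitter, tokenizer,
                2,               // filter level (command-line default)
                0,               // normalization level (command-line default)
                false, false,    // do not merge types or tokens
                "\n", " ", "\t", // sentence, token and source separators
                w);
        w.flush();
    }
}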