com.paolodragone.wsn.extraction.SenseExtractor.java Source code

Java tutorial

Introduction

Here is the source code for com.paolodragone.wsn.extraction.SenseExtractor.java

Source

/*
 * Copyright Paolo Dragone 2014
 *
 * This file is part of WiktionarySemanticNetwork.
 *
 * WiktionarySemanticNetwork is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * WiktionarySemanticNetwork is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with WiktionarySemanticNetwork.  If not, see <http://www.gnu.org/licenses/>.
 */

package com.paolodragone.wsn.extraction;

import com.google.common.base.Stopwatch;
import com.paolodragone.util.io.parsers.wiktionary.WiktionaryXmlParser;
import com.paolodragone.wsn.WsnConfiguration;
import com.paolodragone.wsn.dataset.SensesDataSet;
import com.paolodragone.wsn.entities.Page;
import com.paolodragone.wsn.entities.Sense;
import com.paolodragone.wsn.entities.SenseSet;
import com.paolodragone.wsn.parser.WsnPageComponent;
import com.paolodragone.wsn.parser.WsnPageParser;
import com.paolodragone.wsn.util.Senses;

import java.io.Reader;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Timer;
import java.util.TimerTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;

/**
 * Extracts {@link Sense} entities from a Wiktionary XML dump and numbers them.
 *
 * <p>Each sense that flows through the stream returned by
 * {@link #extractSenses(Reader)} receives the next value of an internal counter
 * as its id (the first id is 1). The counter is an {@link AtomicInteger}, so
 * {@link #getSenseCount()} may be called from another thread (for example a
 * progress reporter) while the stream is being consumed.
 *
 * @author Paolo Dragone
 */
public class SenseExtractor {

    /** Number of senses that have been assigned an id so far. */
    private final AtomicInteger senseCount = new AtomicInteger();

    /**
     * Reads the Wiktionary dump configured in {@link WsnConfiguration}, extracts
     * its senses and writes them to the configured senses file, printing a
     * progress line once per second and the total elapsed time at the end.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Stopwatch stopwatch = Stopwatch.createStarted();
        // Daemon timer: it must never keep the JVM alive on its own.
        Timer senseCountPrinterTimer = new Timer("SenseCountPrinter", true);
        try {
            WsnConfiguration configuration = WsnConfiguration.getInstance();
            Path wiktionaryDumpFilePath = configuration.getWiktionaryDumpFilePath();
            Path sensesFilePath = configuration.getSensesFilePath();

            SenseExtractor extractor = new SenseExtractor();
            senseCountPrinterTimer.scheduleAtFixedRate(new SenseCountPrinter(extractor), 0, 1000);

            // try-with-resources guarantees that both files are closed (and the
            // writer flushed) even when extraction or writing fails.
            try (Reader wiktionaryDumpFileReader = Files.newBufferedReader(wiktionaryDumpFilePath)) {
                Stream<Sense> senseStream = extractor.extractSenses(wiktionaryDumpFileReader);

                SensesDataSet sensesDataSet = new SensesDataSet();
                // The output file is opened only after the pipeline has been built,
                // so a failure above does not create or truncate it.
                try (Writer sensesFileWriter = Files.newBufferedWriter(sensesFilePath)) {
                    // NOTE(review): assumes writeEntities fully consumes the lazy
                    // stream before returning; confirm against SensesDataSet.
                    sensesDataSet.writeEntities(senseStream, sensesFileWriter);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Stop progress output so it cannot interleave with the summary below.
            senseCountPrinterTimer.cancel();
        }
        long elapsed = stopwatch.stop().elapsed(TimeUnit.MINUTES);
        System.out.println("\nTotal time: " + elapsed + " min");
    }

    /**
     * Timer task that prints, on a single console line rewritten in place, the
     * number of senses extracted so far, the elapsed seconds and the throughput.
     */
    private static class SenseCountPrinter extends TimerTask {

        /** Leading {@code \r} returns to the start of the line so it is overwritten. */
        private static final String MESSAGE_FORMAT = "\rSenses: %d\t\t%d sec\t\t%d senses/sec";

        private final SenseExtractor extractor;
        private final Stopwatch stopwatch = Stopwatch.createStarted();

        public SenseCountPrinter(SenseExtractor extractor) {
            this.extractor = extractor;
        }

        @Override
        public void run() {
            int senseCount = extractor.getSenseCount();
            long timeSpan = stopwatch.elapsed(TimeUnit.SECONDS);
            // Nothing is printed during the first second: this also avoids a
            // division by zero when computing the rate.
            if (timeSpan > 0) {
                String message = String.format(MESSAGE_FORMAT, senseCount, timeSpan, senseCount / timeSpan);
                System.out.print(message);
            }
        }
    }

    /**
     * Returns the number of senses that have been assigned an id so far.
     * Safe to call from any thread.
     *
     * @return the current sense count
     */
    public int getSenseCount() {
        return senseCount.get();
    }

    /**
     * Builds a stream of the valid senses found in a Wiktionary dump.
     *
     * <p>Pages are parsed for their glosses only; sense sets and senses are then
     * filtered through {@link SenseSet#filterValidSenseSets} and
     * {@link Senses#filterValidSenses}. As each sense passes through the returned
     * stream it is given the next counter value as its id. The stream is lazy:
     * no id is assigned until a terminal operation consumes it.
     *
     * @param wiktionaryDumpFileReader reader over the Wiktionary XML dump; it is
     *                                 not closed by this method
     * @return a stream of senses with ids assigned on traversal
     * @throws Exception if the parser cannot be set up or the pipeline cannot be built
     */
    public Stream<Sense> extractSenses(Reader wiktionaryDumpFileReader) throws Exception {

        WiktionaryXmlParser wiktionaryXmlParser = new WiktionaryXmlParser(wiktionaryDumpFileReader);

        WsnPageComponent[] components = { WsnPageComponent.GLOSSES };
        WsnPageParser pageParser = new WsnPageParser(components);
        Stream<Page> pageStream = pageParser.parsePages(wiktionaryXmlParser.parallelStream());

        Stream<Sense> senseStream = Senses
                .filterValidSenses(SenseSet.filterValidSenseSets(pageStream.flatMap(Page::getSenseSetStream))
                        .flatMap(SenseSet::getSenseStream));

        // The source is obtained via parallelStream(), so this callback may run on
        // several threads at once; incrementAndGet() keeps every id unique
        // without taking a lock.
        return senseStream.peek(s -> s.setId(senseCount.incrementAndGet()));
    }
}