Java tutorial
/* * Copyright Paolo Dragone 2014 * * This file is part of WiktionarySemanticNetwork. * * WiktionarySemanticNetwork is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * WiktionarySemanticNetwork is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with WiktionarySemanticNetwork. If not, see <http://www.gnu.org/licenses/>. */ package com.paolodragone.wsn.extraction; import com.google.common.base.Stopwatch; import com.paolodragone.util.io.parsers.wiktionary.WiktionaryXmlParser; import com.paolodragone.wsn.WsnConfiguration; import com.paolodragone.wsn.dataset.SensesDataSet; import com.paolodragone.wsn.entities.Page; import com.paolodragone.wsn.entities.Sense; import com.paolodragone.wsn.entities.SenseSet; import com.paolodragone.wsn.parser.WsnPageComponent; import com.paolodragone.wsn.parser.WsnPageParser; import com.paolodragone.wsn.util.Senses; import java.io.Reader; import java.io.Writer; import java.nio.file.Files; import java.nio.file.Path; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.TimeUnit; import java.util.stream.Stream; /** * @author Paolo Dragone */ public class SenseExtractor { public static void main(String[] args) { Stopwatch stopwatch = Stopwatch.createStarted(); try { WsnConfiguration configuration = WsnConfiguration.getInstance(); Path wiktionaryDumpFilePath = configuration.getWiktionaryDumpFilePath(); Path sensesFilePath = configuration.getSensesFilePath(); SenseExtractor extractor = new SenseExtractor(); SenseCountPrinter senseCountPrinter = new SenseCountPrinter(extractor); Timer senseCountPrinterTimer = new Timer("SenseCountPrinter", true); senseCountPrinterTimer.scheduleAtFixedRate(senseCountPrinter, 0, 1000); Reader wiktionaryDumpFileReader = Files.newBufferedReader(wiktionaryDumpFilePath); Stream<Sense> senseStream = extractor.extractSenses(wiktionaryDumpFileReader); SensesDataSet sensesDataSet = new SensesDataSet(); Writer sensesFileWriter = Files.newBufferedWriter(sensesFilePath); sensesDataSet.writeEntities(senseStream, sensesFileWriter); } catch (Exception e) { e.printStackTrace(); } long elapsed = stopwatch.stop().elapsed(TimeUnit.MINUTES); System.out.println("\nTotal time: " + elapsed + " min"); } private static class SenseCountPrinter extends TimerTask { public static final String messageFormat = "\rSenses: %d\t\t%d sec\t\t%d senses/sec"; private final SenseExtractor extractor; private final Stopwatch stopwatch = Stopwatch.createStarted(); public SenseCountPrinter(SenseExtractor extractor) { this.extractor = extractor; } @Override public void run() { int senseCount = extractor.getSenseCount(); long timeSpan = stopwatch.elapsed(TimeUnit.SECONDS); if (timeSpan > 0) { String message = String.format(messageFormat, senseCount, timeSpan, senseCount / timeSpan); System.out.print(message); } } } private int senseCount = 0; public int getSenseCount() { return senseCount; } public Stream<Sense> extractSenses(Reader wiktionaryDumpFileReader) throws Exception { WiktionaryXmlParser wiktionaryXmlParser = new WiktionaryXmlParser(wiktionaryDumpFileReader); WsnPageComponent[] components = { WsnPageComponent.GLOSSES }; WsnPageParser pageParser = new WsnPageParser(components); Stream<Page> pageStream = pageParser.parsePages(wiktionaryXmlParser.parallelStream()); Stream<Sense> senseStream = Senses .filterValidSenses(SenseSet.filterValidSenseSets(pageStream.flatMap(Page::getSenseSetStream)) .flatMap(SenseSet::getSenseStream)); senseStream = senseStream.peek(s -> { synchronized (this) { s.setId(++senseCount); } }); return senseStream; } }