Java tutorial
/* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universitt Darmstadt * <p> * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * <p> * http://www.apache.org/licenses/LICENSE-2.0 * <p> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.mallet.internal.wordembeddings; import de.tudarmstadt.ukp.dkpro.core.api.resources.CompressionUtils; import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import java.io.*; import java.util.Arrays; import java.util.Map; import java.util.stream.Collectors; /** * Helper Methods for reading word embeddings. */ public class MalletEmbeddingsUtils { private static final Log LOG = LogFactory.getLog(MalletEmbeddingsUtils.class); /** * Read an embeddings file in text format. * <p> * If hasHeader is set to true, the first line is expected to contain the size and dimensionality * of the vectors. This is typically true for files generated by Word2Vec (in text format). * * @param file the input file * @param hasHeader if true, read size and dimensionality from the first line * @return a {@code Map<String, double[]>} mapping each token to a vector. * @throws IOException if the input file cannot be read * @see #readEmbeddingFileTxt(InputStream, boolean) */ public static Map<String, double[]> readEmbeddingFileTxt(File file, boolean hasHeader) throws IOException { LOG.info("Reading embeddings from file " + file); InputStream is = CompressionUtils.getInputStream(file.getAbsolutePath(), new FileInputStream(file)); return readEmbeddingFileTxt(is, hasHeader); } /** * Read embeddings in text format from an InputStream. * Each line is expected to have a whitespace-separated list {@code <token> <value1> <value2> ...}. * * @param inputStream an {@link InputStream} * @param hasHeader if true, read size and dimensionality from the first line * @return a {@code Map<String, double[]>} mapping each token to a vector. * @throws IOException if the input file cannot be read */ public static Map<String, double[]> readEmbeddingFileTxt(InputStream inputStream, boolean hasHeader) throws IOException { BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); final int dimensions; final int size; if (hasHeader) { String[] header = reader.readLine().split(" "); assert header.length == 2; size = Integer.parseInt(header[0]); dimensions = Integer.parseInt(header[1]); } else { dimensions = -1; size = -1; } Map<String, double[]> embeddings = reader.lines().map(MalletEmbeddingsUtils::lineToEmbedding) .collect(Collectors.toMap(Pair::getKey, Pair::getValue)); reader.close(); /* assert size and dimension */ if (hasHeader) { /* check that size read matches header information */ LOG.debug("Checking number and vector sizes for all embeddings."); assert size == embeddings.size(); assert embeddings.values().stream().allMatch(vector -> dimensions == vector.length); } else { LOG.debug("Checking vector sizes for all embeddings."); int firstLength = embeddings.values().stream().findAny().get().length; assert embeddings.values().stream().allMatch(vector -> firstLength == vector.length); } return embeddings; } /** * Convert a single line in the expected format ({@code <token> <value1> ... <valueN>} int a pair * holding the token and the corresponding vector. * * @param line a line * @return a {@link Pair} */ private static Pair<String, double[]> lineToEmbedding(String line) { String[] array = line.split(" "); int size = array.length; double[] vector = Arrays.stream(array, 1, size).mapToDouble(Double::parseDouble).toArray(); return Pair.of(array[0], vector); } }