// Latent Relational Analysis (LRA) implementation — see the class Javadoc below.
/* * Copyright 2009 Sky Lin * * This file is part of the S-Space package and is covered under the terms and * conditions therein. * * The S-Space package is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as published * by the Free Software Foundation and distributed hereunder to you. * * THIS SOFTWARE IS PROVIDED "AS IS" AND NO REPRESENTATIONS OR WARRANTIES, * EXPRESS OR IMPLIED ARE MADE. BY WAY OF EXAMPLE, BUT NOT LIMITATION, WE MAKE * NO REPRESENTATIONS OR WARRANTIES OF MERCHANT- ABILITY OR FITNESS FOR ANY * PARTICULAR PURPOSE OR THAT THE USE OF THE LICENSED SOFTWARE OR DOCUMENTATION * WILL NOT INFRINGE ANY THIRD PARTY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER * RIGHTS. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package edu.ucla.sspace.lra; import static edu.ucla.sspace.common.Similarity.cosineSimilarity; import edu.ucla.sspace.matrix.LogEntropyTransform; import edu.ucla.sspace.matrix.Matrices; import edu.ucla.sspace.matrix.Matrix; import edu.ucla.sspace.matrix.MatrixFactorization; import edu.ucla.sspace.matrix.MatrixFile; import edu.ucla.sspace.matrix.MatrixIO; import edu.ucla.sspace.matrix.SVD; import edu.ucla.sspace.matrix.Transform; import edu.ucla.sspace.util.HashMultiMap; import edu.ucla.sspace.util.BoundedSortedMap; import edu.ucla.sspace.util.Pair; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.FileReader; import java.io.IOError; import java.io.IOException; import java.io.PrintStream; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.lang.Float; import java.lang.Integer; import java.lang.Math; import java.util.Collections; import java.util.List; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; 
import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.Scanner; import java.util.SortedMap; import edu.smu.tspell.wordnet.*; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.WhitespaceAnalyzer; import org.apache.lucene.document.DateTools; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriter.MaxFieldLength; import org.apache.lucene.index.Term; import org.apache.lucene.queryParser.QueryParser; import org.apache.lucene.search.HitCollector; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Searcher; import org.apache.lucene.search.Similarity; import org.apache.lucene.search.Query; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; /** * An implementation of Latent Relational Analysis (LRA). This implementation is * based on two papers. * <ul> * * <li style="font-family:Garamond, Georgia, serif"> Peter D. Turney * (2004). Human-Level Performance on Word Analogy Questions by * Latent Relational Analysis. Available <a * href="http://iit-iti.nrc-cnrc.gc.ca/iit-publications-iti/docs/NRC-47422.pdf">here</a> </li> * * <li style="font-family:Garamond, Georgia, serif"> Peter D. Turney (2005). * Measuring Semantic Similarity by Latent Relational Analysis. * Available * <a href="http://portal.acm.org/citation.cfm?id=1174523">here</a> * </li> * * </ul> * <p> * * LRA uses three main components to analyze a large corpus of text in order to * measure relational similarity between pairs of words (i.e. analogies). 
LRA * uses the search engine to find patterns based on the input set as well as its * corresponding alternates (see {@link #loadAnalogiesFromFile(String)}). A * sparse matrix is then generated, where each value in the matrix is the number * of times the row's word pair occurs with the column's pattern between * them.<p> * * After the matrix has been built, the <a * href="http://en.wikipedia.org/wiki/Singular_value_decomposition">Singular * Value Decomposition</a> (SVD) is used to reduce the dimensionality of the * original word-document matrix, denoted as <span style="font-family:Garamond, * Georgia, serif">A</span>. The SVD is a way of factoring any matrix A into * three matrices <span style="font-family:Garamond, Georgia, serif">U Σ * V<sup>T</sup></span> such that <span style="font-family:Garamond, Georgia, * serif"> Σ </span> is a diagonal matrix containing the singular values * of <span style="font-family:Garamond, Georgia, serif">A</span>. The singular * values of <span style="font-family:Garamond, Georgia, serif"> Σ </span> * are ordered according to which causes the most variance in the values of * <span style="font-family:Garamond, Georgia, serif">A</span>. The original * matrix may be approximated by recomputing the matrix with only <span * style="font-family:Garamond, Georgia, serif">k</span> of these singular * values and setting the rest to 0. The approximated matrix <span * style="font-family:Garamond, Georgia, serif"> Â = U<sub>k</sub> * Σ<sub>k</sub> V<sub>k</sub><sup>T</sup></span> is the least squares * best-fit rank-<span style="font-family:Garamond, Georgia, serif">k</span> * approximation of <span style="font-family:Garamond, Georgia, serif">A</span>. 
* LRA reduces the dimensions by keeping only the first <span * style="font-family:Garamond, Georgia, serif">k</span> dimensions from the row * vectors of <span style="font-family:Garamond, Georgia, serif">U</span> and the * <span style="font-family:Garamond, Georgia, serif">k</span> dimensions from the * column vectors of <span style="font-family:Garamond, Georgia, serif">Σ</span>. * The projection matrix <span style="font-family:Garamond, Georgia, serif">UΣ * </span> is then used to calculate the relational similarities between pairs using * the row vectors corresponding to the word pairs.<p> * * This class uses the <a href="http://lucene.apache.org/java/docs/">Apache * Lucune Search Engine</a> for optimal indexing and filtering of word pairs * using any given corpus. This class also uses Wordnet through the <a * href="http://lyle.smu.edu/~tspell/jaws/index.html">JAWS</a> interface in * order to find alternate word pairs from given input pairs. * * * @author Sky Lin **/ public class LatentRelationalAnalysis { //TODO: have a way to set these values public static final String LRA_DIMENSIONS_PROPERTY = "edu.ucla.sspace.lra.LRA.dimensions"; public static final String LRA_INDEX_DIR = "edu.ucla.sspace.lra.LRA.index_dir"; public static final String LRA_SKIP_INDEX = "edu.ucla.sspace.lra.LRA.skip_index"; public static final String LRA_READ_MATRIX_FILE = "edu.ucla.sspace.lra.LRA.readMatrixFile"; public static final String LRA_WRITE_MATRIX_FILE = "edu.ucla.sspace.lra.LRA.writeMatrixFile"; private static final int NUM_SIM = 10; private static final int MAX_PHRASE = 5; private static final int NUM_FILTER = 3; private static final int MAX_INTER = 3; private static final int MIN_INTER = 1; private static final int NUM_PATTERNS = 4000; private List<String> original_pairs; private List<String> filtered_phrases; private Map<String, ArrayList<String>> original_to_alternates; private BoundedSortedMap<InterveningWordsPattern, Integer> pattern_list; private Map<Integer, String> 
matrix_row_map; private Map<Integer, InterveningWordsPattern> matrix_column_map; private String INDEX_DIR; private String DATA_DIR; /** * The {@link MatrixFactorization} algorithm that will decompose the word by * document feature space into two smaller feature spaces: a word by class * feature space and a class by feature space. */ private final MatrixFactorization reducer; /** * Constructor for {@code LatentRelationalAnalysis}. * * @param corpus_directory a {@code String} containing the absolute path to * the directory containing the corpus * @param index_directory a {@code String} containing the absolute path to * the directory where the index created by Lucene will be stored * @param do_index {@code true} if the index step should be performed. * {@code false} if the index file already exists under * index_directory (will skip indexing step). * */ public LatentRelationalAnalysis(String corpus_directory, String index_directory, boolean do_index, MatrixFactorization reducer) { this.reducer = reducer; //set system property for Wordnet database directory Properties sysProps = System.getProperties(); sysProps.setProperty("wordnet.database.dir", "/usr/share/wordnet"); System.err.println("starting LRA...\n"); INDEX_DIR = index_directory; DATA_DIR = corpus_directory; if (do_index) { initializeIndex(INDEX_DIR, DATA_DIR); } original_pairs = new ArrayList<String>(); filtered_phrases = new ArrayList<String>(); original_to_alternates = new HashMap<String, ArrayList<String>>(); pattern_list = new BoundedSortedMap<InterveningWordsPattern, Integer>(NUM_PATTERNS); matrix_column_map = new HashMap<Integer, InterveningWordsPattern>(); matrix_row_map = new HashMap<Integer, String>(); } /** * Loads the analogies from an input file. * The file must contain word pairs in the form of A:B separated by newlines. * * @param analogy_file a {@code String} containing the absolute path to the analogy file. 
*/ public void loadAnalogiesFromFile(String analogy_file) { try { Scanner sc = new Scanner(new File(analogy_file)); while (sc.hasNext()) { String analogy = sc.next(); if (!isAnalogyFormat(analogy)) { System.err.println("\"" + analogy + "\" not in proper format."); continue; } String analogy_pair[] = analogy.split(":"); String A = analogy_pair[0]; String B = analogy_pair[1]; //1. Find alternates for A and B Synset[] A_prime = findAlternatives(A); Synset[] B_prime = findAlternatives(B); //2. Filter phrases ArrayList<String> tmp = new ArrayList<String>(filterPhrases(INDEX_DIR, A, B, A_prime, B_prime)); filtered_phrases.addAll(tmp); original_to_alternates.put(A + ":" + B, tmp); } sc.close(); } catch (Exception e) { System.err.println("Could not read file."); } } /** * Returns the synonyms for the specified term. The synonyms will be taken * directly from the WordNet database. This is used by LRA to find * alternative pairs. Given an input set of A:B. For each A' that is * similar to A, make a new pair A':B. Likewise for B. * * @param term a {@code String} containing a single word * @return an array of all the synonyms */ public static Synset[] findAlternatives(String term) { WordNetDatabase database = WordNetDatabase.getFileInstance(); Synset[] all = database.getSynsets(term); return all; } /** * Initializes an index given the index directory and data directory. 
* * @param indexDir a {@code String} containing the directory where the index * will be stored * @param dataDir a {@code String} containing the directory where the data * is found */ public static void initializeIndex(String indexDir, String dataDir) { File indexDir_f = new File(indexDir); File dataDir_f = new File(dataDir); long start = new Date().getTime(); try { int numIndexed = index(indexDir_f, dataDir_f); long end = new Date().getTime(); System.err.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } catch (IOException e) { System.err.println("Unable to index " + indexDir_f + ": " + e.getMessage()); } } //creates the index files private static int index(File indexDir, File dataDir) throws IOException { if (!dataDir.exists() || !dataDir.isDirectory()) { throw new IOException(dataDir + " does not exist or is not a directory"); } IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(), true, IndexWriter.MaxFieldLength.UNLIMITED); writer.setUseCompoundFile(false); indexDirectory(writer, dataDir); int numIndexed = writer.numDocs(); writer.optimize(); writer.close(); return numIndexed; } /** * recursive method that finds interleving patterns between A and B in all files * within a given directory **/ private static HashSet<String> searchDirectoryForPattern(File dir, String A, String B) throws Exception { File[] files = dir.listFiles(); HashSet<String> pattern_set = new HashSet<String>(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { pattern_set.addAll(searchDirectoryForPattern(f, A, B)); } else if (f.getName().endsWith(".txt")) { Scanner sc = new Scanner(f); while (sc.hasNext()) { if (A.equals(sc.next())) { String pattern = ""; int count = 0; while (count <= MAX_INTER && sc.hasNext()) { String curr = sc.next(); if (count >= MIN_INTER && B.equals(curr)) { //add the String onto a Set of Strings containing the patterns //System.err.println("adding pattern: " + pattern); 
pattern_set.add(pattern); break; /* for (int j = 0; j < count; j++) { System.err.print(pattern[j] + " "); } System.err.print("\n"); */ } else { if (count > 0) { pattern += " "; } pattern += curr; count++; } } } } } } return pattern_set; } /** * recursive method that calls itself when it finds a directory, or indexes if * it is at a file ending in ".txt" **/ private static void indexDirectory(IndexWriter writer, File dir) throws IOException { File[] files = dir.listFiles(); for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { indexDirectory(writer, f); } else if (f.getName().endsWith(".txt")) { indexFile(writer, f); } } } /** * method to actually index a file using Lucene, adds a document * onto the index writer **/ private static void indexFile(IndexWriter writer, File f) throws IOException { if (f.isHidden() || !f.exists() || !f.canRead()) { System.err.println("Could not write " + f.getName()); return; } System.err.println("Indexing " + f.getCanonicalPath()); Document doc = new Document(); doc.add(new Field("path", f.getCanonicalPath(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("modified", DateTools.timeToString(f.lastModified(), DateTools.Resolution.MINUTE), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("contents", new FileReader(f))); writer.addDocument(doc); } /** * Searches an index given the index directory and counts up the frequncy of the two words used in a phrase. 
* * @param indexDir a String containing the directory where the index is stored * @param A a {@code String} containing the first word of the phrase * @param B a {@code String} containing the last word of the phrase * @return float */ public static float countPhraseFrequencies(String indexDir, String A, String B) { File indexDir_f = new File(indexDir); if (!indexDir_f.exists() || !indexDir_f.isDirectory()) { System.err.println("Search failed: index directory does not exist"); } else { try { return searchPhrase(indexDir_f, A, B); } catch (Exception e) { System.err.println("Unable to search " + indexDir); return 0; } } return 0; } //method that actually does the searching private static float searchPhrase(File indexDir, String A, String B) throws Exception { Directory fsDir = FSDirectory.getDirectory(indexDir); IndexSearcher searcher = new IndexSearcher(fsDir); long start = new Date().getTime(); QueryParser parser = new QueryParser("contents", new StandardAnalyzer()); //System.err.println("searching for: '\"" + A + " " + B + "\"~"+MAX_PHRASE+"'"); parser.setPhraseSlop(MAX_PHRASE); String my_phrase = "\"" + A + " " + B + "\""; Query query = parser.parse(my_phrase); //System.err.println("total hits: " + results.totalHits); //set similarity to use only the frequencies //score is based on frequency of phrase only searcher.setSimilarity(new Similarity() { public static final long serialVersionUID = 1L; public float coord(int overlap, int maxOverlap) { return 1; } public float queryNorm(float sumOfSquaredWeights) { return 1; } public float tf(float freq) { return freq; } public float idf(int docFreq, int numDocs) { return 1; } public float lengthNorm(String fieldName, int numTokens) { return 1; } public float sloppyFreq(int distance) { return 1; } }); TopDocs results = searcher.search(query, 10); ScoreDoc[] hits = results.scoreDocs; float total_score = 0; //add up the scores for (ScoreDoc hit : hits) { Document doc = searcher.doc(hit.doc); //System.err.printf("%5.3f %sn\n", 
// hit.score, doc.get("contents")); total_score += hit.score; } long end = new Date().getTime(); searcher.close(); return total_score; } /** * Returns an ArrayList of phrases with the greatest frequencies in the corpus. * For each alternate pair, send a phrase query to the Lucene search engine * containing the corpus. The phrase query will find the frequencies of phrases * that begin with one member of the pair and end with the other. The phrases * cannot have more than MAX_PHRASE words. * Select the top NUM_FILTER (current NUM_FILTER=3) most frequent phrases and * * return them along with the original pairs. * * NOTE: should be called before {@link #findPatterns()}. * * @param A a {@code String} containing the first member in the original pair * @param B a {@code String} containing the second member in the original pair * @param A_prime a {@code Synset} array containing the alternates for A * @param B_prime a {@code Synset} array containing the alternates for B * @return an ArrayList of {@code String} with the top NUM_FILTER pairs along with the original pairs */ public static ArrayList<String> filterPhrases(String INDEX_DIR, String A, String B, Synset[] A_prime, Synset[] B_prime) { HashMultiMap<Float, Pair<String>> phrase_frequencies = new HashMultiMap<Float, Pair<String>>(); //Search corpus... A:B //phrase_frequencies.put(new Float(countPhraseFrequencies(INDEX_DIR, A, B)),new Pair<String>(A,B)); //System.err.println("Top 10 Similar words:"); int count = 0; for (int i = 0; (i < NUM_SIM && i < A_prime.length); i++) { String[] wordForms = A_prime[i].getWordForms(); for (int j = 0; j < wordForms.length; j++) { if (wordForms[j].compareTo(A) != 0) { //Search corpus... 
A':B Float score = new Float(countPhraseFrequencies(INDEX_DIR, wordForms[j], B)); phrase_frequencies.put(score, new Pair<String>(wordForms[j], B)); count++; } if (count >= NUM_SIM) break; } if (count >= NUM_SIM) break; } count = 0; for (int i = 0; (i < NUM_SIM && i < B_prime.length); i++) { String[] wordForms = B_prime[i].getWordForms(); for (int j = 0; j < wordForms.length; j++) { if (wordForms[j].compareTo(B) != 0) { //Search corpus... A:B' Float score = new Float(countPhraseFrequencies(INDEX_DIR, A, wordForms[j])); phrase_frequencies.put(score, new Pair<String>(A, wordForms[j])); count++; } if (count >= NUM_SIM) break; } if (count >= NUM_SIM) break; } // filter out the phrases and add the top 3 to the ArrayList, and return it Iterator iter = phrase_frequencies.keySet().iterator(); //TODO: make number of filters dynamic //create Array with size = num filters ArrayList<String> filtered_phrases = new ArrayList<String>(); Float filter1 = new Float(0.0); Float filter2 = new Float(0.0); Float filter3 = new Float(0.0); while (iter.hasNext()) { Float curr_key = (Float) iter.next(); //this will bump the filters up each time a greater value comes along //so that filter1 will be the greatest key and filter3 the 3rd greatest if (curr_key > filter1) { filter3 = filter2; filter2 = filter1; filter1 = curr_key; } else if (curr_key > filter2) { filter3 = filter2; filter2 = curr_key; } else if (curr_key > filter3) { filter3 = curr_key; } } int filter_count = 0; Iterator val_iter = phrase_frequencies.get(filter1).iterator(); while (val_iter.hasNext() && filter_count < 3) { String alternative_pair = val_iter.next().toString(); String pair_arr[] = parsePair(alternative_pair); filtered_phrases.add(pair_arr[0] + ":" + pair_arr[1]); filter_count++; } val_iter = phrase_frequencies.get(filter2).iterator(); while (val_iter.hasNext() && filter_count < 3) { String alternative_pair = val_iter.next().toString(); String pair_arr[] = parsePair(alternative_pair); filtered_phrases.add(pair_arr[0] 
+ ":" + pair_arr[1]); filter_count++; } val_iter = phrase_frequencies.get(filter3).iterator(); while (val_iter.hasNext() && filter_count < 3) { String alternative_pair = val_iter.next().toString(); String pair_arr[] = parsePair(alternative_pair); filtered_phrases.add(pair_arr[0] + ":" + pair_arr[1]); filter_count++; } //throw in the original pair also filtered_phrases.add(A + ":" + B); return filtered_phrases; } /** * Makes patterns by replacing words in str with wildcards based on the binary value of c. */ private static String combinatorialPatternMaker(String[] str, int str_size, int c) { String comb_pattern = ""; int curr_comb = 1; for (int i = 0; i < str_size; i++) { if ((c & curr_comb) != 0) { comb_pattern += str[i] + "\\s"; } else { comb_pattern += "[\\w]+\\s"; } curr_comb = curr_comb << 1; } //System.err.println(comb_pattern); return comb_pattern; } /** * Searches through all the .txt files in a directory and returns the total number * of occurrences of a pattern. */ private static int countWildcardPhraseFrequencies(File dir, String pattern) throws Exception { File[] files = dir.listFiles(); int total = 0; for (int i = 0; i < files.length; i++) { File f = files[i]; if (f.isDirectory()) { total += countWildcardPhraseFrequencies(f, pattern); } else if (f.getName().endsWith(".txt")) { Scanner sc = new Scanner(f); while (sc.hasNext()) { String line = sc.nextLine(); if (line.matches(pattern)) { total++; } } } } return total; } /** * parses a pair in the form {A, B} **/ private static String[] parsePair(String pair) { String[] tmp = new String[2]; int indexOfA = pair.indexOf('{') + 1; int indexOfB = pair.indexOf(','); tmp[0] = pair.substring(indexOfA, indexOfB); tmp[1] = pair.substring(indexOfB + 2, pair.length() - 1); return tmp; } /** * Finds patterns using the filtered phrases. Should be called after {@link * #filterPhrases(String,String,Syntex[],Synset[]) filterPhrases}. 
**/ public void findPatterns() throws Exception { HashSet<String> patterns = new HashSet<String>(); for (String phrase : filtered_phrases) { String phrase_arr[] = phrase.split(":"); String A = phrase_arr[0]; String B = phrase_arr[1]; //System.err.println(A + ": " + B); patterns.addAll(searchDirectoryForPattern(new File(DATA_DIR), A, B)); } Iterator iter = patterns.iterator(); while (iter.hasNext()) { String curr_pattern_str = (String) iter.next(); String[] curr_pattern = curr_pattern_str.split("\\s"); int curr_length = curr_pattern.length; //System.err.println("length of pattern: " + curr_length); //do a for loop with all combinatorials of wildcard patterns //for each iteration do a wildcard search for (int comb = 0; comb < (int) Math.pow(2.0, (double) curr_length); comb++) { String comb_pattern = "\\s" + combinatorialPatternMaker(curr_pattern, curr_length, comb); try { int score = countWildcardPhraseFrequencies(new File(DATA_DIR), ".*" + comb_pattern + ".*"); InterveningWordsPattern db_pattern = new InterveningWordsPattern(comb_pattern); db_pattern.setOccurrences(score); pattern_list.put(db_pattern, score); //insert the pattern into database (only if it has a high enough score) //System.err.println(comb_pattern + ": " + score); } catch (Exception e) { System.err.println("could not perform wildcard search"); } } } } /** * Maps a list of patterns to the columns of the sparse matrix. Takes the * results of {@link #findPattern()} and maps it to the column indeces of a sparse * matrix. 
*/ public void mapColumns() { //System.err.print("Patterns found: "); //System.err.println(pattern_list.size()); int index = 0; //NOTE: occurrences can be used as Sigma X<k,j> when calculating Entropy for (InterveningWordsPattern a_pattern : pattern_list.keySet()) { //int val = a_pattern.getOccurrences(); //System.err.println(a_pattern.getPattern() + " " + val); matrix_column_map.put(new Integer(index), a_pattern); index++; InterveningWordsPattern b_pattern = new InterveningWordsPattern(a_pattern.getPattern()); b_pattern.setOccurrences(a_pattern.getOccurrences()); b_pattern.setReverse(true); matrix_column_map.put(new Integer(index), b_pattern); index++; } } /** * Maps a list of phrases to the rows of the sparse matrix. * Takes an ArrayList containing the filtered phrases (originals and alternates) and maps them to the sparse matrix. * * @param phrases an ArrayList containing the filtered phrases * @return void */ public void mapRows() { int index = 0; for (String a_phrase : filtered_phrases) { String[] curr = a_phrase.split(":"); String A = curr[0]; String B = curr[1]; matrix_row_map.put(new Integer(index), A + ":" + B); index++; //add reverse pair as well matrix_row_map.put(new Integer(index), B + ":" + A); index++; } } /** * Creates the sparse matrix. Should be called after {@link * #findPatterns()}, {@link #mapRows()}, and {@link #mapColumns()}. The * returned {@code Matrix} should be used in the SVD process. * * @return the sparse Matrix. 
**/ public Matrix createSparseMatrix() { Matrix m = Matrices.create(matrix_row_map.size(), matrix_column_map.size(), false); for (int row_num = 0; row_num < matrix_row_map.size(); row_num++) { // for each pattern String p = matrix_row_map.get(new Integer(row_num)); String[] p_sp = p.split(":"); String a = p_sp[0]; String b = p_sp[1]; for (int col_num = 0; col_num < matrix_column_map.size(); col_num++) { // for each phrase InterveningWordsPattern col_pattern = matrix_column_map.get(new Integer(col_num)); String pattern = col_pattern.getPattern(); String comb_patterns; if (col_pattern.getReverse()) { //if the column is a reverse pattern...word2 P word1 comb_patterns = ".*\\s" + b + pattern + a + "\\s.*"; } else { comb_patterns = ".*\\s" + a + pattern + b + "\\s.*"; } try { m.set(row_num, col_num, (double) countWildcardPhraseFrequencies(new File(DATA_DIR), comb_patterns)); } catch (Exception e) { System.err.println("could not perform wildcard search"); } } } System.err.println("\nCompleted matrix generation."); //System.err.println("Number of rows: " + m.rows()); //System.err.println("Number of cols: " + m.columns()); return m; } /** * Applies log and entropy transformations to the sparse matrix [Landauer and Dumais, 1997]. * * @return the sparse Matrix after log and entropy transformations. 
**/ public Matrix applyEntropyTransformations(Matrix mat) { int n = mat.columns(); int m = mat.rows(); for (int col_num = 0; col_num < n; col_num++) { double col_total = 0.0; for (int row_num = 0; row_num < m; row_num++) { col_total += mat.get(row_num, col_num); } //System.err.println("coltotal: " + col_total); if (col_total == 0.0) continue; double entropy = 0.0; for (int row_num = 0; row_num < m; row_num++) { double p = mat.get(row_num, col_num) / col_total; //System.err.print(p + " "); if (p == 0.0) continue; entropy += p * Math.log10(p); } //System.err.println("entropy: " + entropy); entropy *= -1; double w = 1 - entropy / Math.log10(m); //System.err.println("w: " + w); for (int row_num = 0; row_num < m; row_num++) { mat.set(row_num, col_num, w * Math.log10(mat.get(row_num, col_num) + 1.0)); } } return mat; } /** * returns the index of the String in the HashMap, or -1 if value was not found. **/ private static int getIndexOfPair(String value, Map<Integer, String> row_data) { for (Integer i : row_data.keySet()) { if (row_data.get(i).equals(value)) { return i.intValue(); } } return -1; } /** * Computes the cosine similarity of an analogy using the projection matrix. * The relational similarity between A:B and C:D is the average of the cosines * values between combinations of the similar pairs. The cosines from * the similar pairs must be greater than or equal to the cosine of the * original pairs, A:B and C:D. * * @param analogy a String containing the two pairs to compare. 
The analogy must be in the form A:B::C:D, where A:B and C:D are two analogies from the input set * @param m the projection Matrix * @return a double value containing the cosine similarity value of the analogy **/ public double computeCosineSimilarity(String analogy, Matrix m) { double cosineVals = 0.0; int totalVals = 0; if (!isAnalogyFormat(analogy, true)) { System.err.println("Analogy: \"" + analogy + "\" not in proper format"); return 0.0; } String pairs[] = analogy.split("::"); String pair1 = pairs[0]; String pair2 = pairs[1]; if (!isAnalogyFormat(pair1) || !isAnalogyFormat(pair2)) { System.err.println("Analogy: \"" + analogy + "\" not in proper format"); return 0.0; } if (!original_to_alternates.containsKey(pair1) || !original_to_alternates.containsKey(pair2)) { //check if the reverse pair exists String pair1_pair[] = pair1.split(":"); String pair1_a = pair1_pair[1]; String pair1_b = pair1_pair[0]; String pair2_pair[] = pair2.split(":"); String pair2_a = pair2_pair[1]; String pair2_b = pair2_pair[0]; pair1 = pair1_a + ":" + pair1_b; pair2 = pair2_a + ":" + pair2_b; if (!original_to_alternates.containsKey(pair1) || !original_to_alternates.containsKey(pair2)) { System.err.println("Analogy: \"" + analogy + "\" not included in original pairs"); return 0.0; } } double original_cosineVal = cosineSimilarity(m.getRow(getIndexOfPair(pair1, matrix_row_map)), m.getRow(getIndexOfPair(pair2, matrix_row_map))); cosineVals += original_cosineVal; totalVals++; //System.err.println("orig cos: " + cosineVals); ArrayList<String> alternates1 = original_to_alternates.get(pair1); ArrayList<String> alternates2 = original_to_alternates.get(pair2); for (String a : alternates1) { for (String b : alternates2) { int a_index = getIndexOfPair(a, matrix_row_map); int b_index = getIndexOfPair(b, matrix_row_map); if (a_index != -1 && b_index != -1) { double alternative_cosineVal = cosineSimilarity(m.getRow(a_index), m.getRow(b_index)); //System.err.println("adding cos: " + 
alternative_cosineVal); if (alternative_cosineVal >= original_cosineVal) { cosineVals += alternative_cosineVal; totalVals++; } } } } if (totalVals > 0) { return cosineVals / totalVals; } else { return 0.0; } } /** * Does the Singular Value Decomposition using the generated sparse matrix. * The dimensions used cannot exceed the number of columns in the original * matrix. * * @param sparse_matrix the sparse {@code Matrix} * @param dimensions the number of singular values to calculate * @return The decomposed word space {@link Matrix} */ public Matrix computeSVD(Matrix sparse_matrix, int dimensions) { try { File rawTermDocMatrix = File.createTempFile("lra-term-document-matrix", ".dat"); MatrixIO.writeMatrix(sparse_matrix, rawTermDocMatrix, MatrixIO.Format.SVDLIBC_SPARSE_TEXT); MatrixFile mFile = new MatrixFile(rawTermDocMatrix, MatrixIO.Format.SVDLIBC_SPARSE_TEXT); reducer.factorize(mFile, dimensions); return reducer.dataClasses(); } catch (IOException ioe) { throw new IOError(ioe); } } /** * Reads analogies from file and outputs their cosine similarities to another file. 
* * @param projection the projection {@code Matrix} * @param inputFileName the input file containing analogies in the proper format * separated by newlines * @param outputFileName the output file where the results will be stored * @return void * * @see #computeCosineSimilarity(String,Matrix) **/ public void evaluateAnalogies(Matrix projection, String inputFileName, String outputFileName) { try { Scanner sc = new Scanner(new File(inputFileName)); PrintStream out = new PrintStream(new FileOutputStream(outputFileName)); while (sc.hasNext()) { String analogy = sc.next(); if (!isAnalogyFormat(analogy, true)) { System.err.println("\"" + analogy + "\" not in proper format."); continue; } double cosineVal = computeCosineSimilarity(analogy, projection); //does the actual cosine value calculations and comparisons out.println(analogy + " = " + cosineVal); } sc.close(); out.close(); } catch (Exception e) { System.err.println("Could not read file."); } } /** * Reads analogies from Standard In and outputs their cosine similarities to Standard Out. * * @param projection the projection {@code Matrix} * * @see #computeCosineSimilarity(String,Matrix) **/ public void evaluateAnalogies(Matrix projection) { try { Scanner sc = new Scanner(System.in); while (sc.hasNext()) { String analogy = sc.next(); if (!isAnalogyFormat(analogy, true)) { System.err.println("\"" + analogy + "\" not in proper format."); continue; } double cosineVal = computeCosineSimilarity(analogy, projection); //does the actual cosine value calculations and comparisons System.out.println(analogy + " = " + cosineVal); } sc.close(); } catch (Exception e) { System.err.println("Could not read file."); } } /** * prints the {@code Matrix} to standard out. 
* * @param rows an {@code int} containing the number of rows in m * @param cols an {@code int} containing the number of cols in m * @param m the {@code Matrix} to print * @return void **/ public static void printMatrix(int rows, int cols, Matrix m) { for (int col_num = 0; col_num < cols; col_num++) { for (int row_num = 0; row_num < rows; row_num++) { System.out.print(m.get(row_num, col_num) + " "); } System.out.print("\n"); } System.out.print("\n"); } /** * Checks whether the analogy is in the proper format. * An analogy is in proper format if it contains two {@code Strings} separated by * a colon (:) * * @param analogy a {@code String} containing the two pairs to compare. The analogy should be in the form A:B * @return true if the analogy is in proper format **/ public static boolean isAnalogyFormat(String analogy) { return isAnalogyFormat(analogy, false); } /** * Checks whether the analogy is in the proper format. * An analogy is in proper format if it contains two {@code Strings} separated by * a colon (:), or two colons (::) if it is a pair of analogies. * * @param analogy a {@code String} containing the two pairs to compare. * The analogy should be in the form A:B if it is not a pair, or * A:B::C:D if it is a pair of analogies. * @param pair true if it is a pair of analogies * @return true if the analogy is in proper format **/ public static boolean isAnalogyFormat(String analogy, boolean pair) { if (pair) { return analogy.matches("[\\w]+:[\\w]+::[\\w]+:[\\w]+"); } else { return analogy.matches("[\\w]+:[\\w]+"); } } /* //sample main function public static void main(String[] args) { System.err.println("skipping indexing step..."); String index= "/argos/lra/index_textbooks/"; String data= "/bigdisk/corpora/textbooks/"; LRA lra = new LRA(data,index,false); try { //load analogy input lra.loadAnalogiesFromFile("/home/chippoc/analogies.txt"); //3. Get patterns 4. Filter top NUM_PATTERNS lra.findPatterns(); //5. Map phrases to rows lra.mapRows(); //6. 
Map patterns to columns lra.mapColumns(); //7. Create sparse matrix Matrix sparse_matrix = lra.createSparseMatrix(); //8. Calculate entropy System.err.println("Calculating entropy..."); sparse_matrix = lra.applyEntropyTransformations(sparse_matrix); //printMatrix(sparse_matrix.rows(), sparse_matrix.columns(), sparse_matrix); //Matrix tmp_matrix = MatrixIO.readMatrix(rawTermDocMatrix, MatrixIO.Format.SVDLIBC_SPARSE_TEXT,Matrix.Type.SPARSE_IN_MEMORY); //printMatrix(tmp_matrix.rows(), tmp_matrix.columns(), tmp_matrix); //9. Compute SVD on the pre-processed matrix. int dimensions = 300; Matrix[] usv = lra.computeSVD(sparse_matrix, dimensions); //10. Compute projection matrix from U and S. Matrix projection = Matrices.multiply(usv[0],usv[1]); printMatrix(projection.rows(), projection.columns(), projection); System.err.println("Completed LRA...\n"); //11. Get analogy input and Evaluate Alternatives String inputFile = "/home/chippoc/testIn.txt"; String outputFile= "/home/chippoc/testOut.txt"; lra.evaluateAnalogies(projection, inputFile, outputFile); } catch (Exception e) { System.err.println("FAILURE"); } } */ }