Java tutorial
/* * Copyright 2014 Alpha Cephei Inc. * All Rights Reserved. Use is subject to license terms. * * See the file "license.terms" for information on usage and * redistribution of this file, and for a DISCLAIMER OF ALL * WARRANTIES. * */ package edu.cmu.sphinx.alignment; import static com.google.common.base.Functions.forMap; import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkNotNull; import static com.google.common.base.Predicates.in; import static com.google.common.base.Predicates.not; import static com.google.common.collect.Iterables.filter; import static com.google.common.collect.Iterables.limit; import static com.google.common.collect.Iterables.skip; import static com.google.common.collect.Lists.*; import static com.google.common.collect.Maps.newHashMap; import static com.google.common.collect.Sets.newHashSet; import static com.google.common.collect.Sets.newTreeSet; import static edu.cmu.sphinx.util.PriorityQueue.newPriorityQueue; import static java.lang.Math.abs; import static java.lang.Math.max; import static java.util.Arrays.fill; import static java.util.Collections.emptyList; import java.util.*; import com.google.common.base.Function; import com.google.common.base.Joiner; import com.google.common.collect.ArrayListMultimap; import com.google.common.collect.Multimap; import com.google.common.collect.Range; import edu.cmu.sphinx.util.PriorityQueue; /** * * @author Alexander Solovets */ public class LongTextAligner implements SequenceAligner<String> { private final class Alignment { public final class Node { private final int databaseIndex; private final int queryIndex; private Node(int row, int column) { this.databaseIndex = column; this.queryIndex = row; } public int getDatabaseIndex() { return shifts.get(databaseIndex - 1); } public int getQueryIndex() { return indices.get(queryIndex - 1); } public String getQueryWord() { if (queryIndex > 0) return query.get(getQueryIndex()); return null; } public String getDatabaseWord() { if (databaseIndex > 0) return reftup.get(getDatabaseIndex()); return null; } public int getValue() { if (isBoundary()) return max(queryIndex, databaseIndex); return hasMatch() ? 0 : 1; } public boolean hasMatch() { return getQueryWord().equals(getDatabaseWord()); } public boolean isBoundary() { return queryIndex == 0 || databaseIndex == 0; } public boolean isTarget() { return queryIndex == indices.size() && databaseIndex == shifts.size(); } public List<Node> adjacent() { List<Node> result = newArrayListWithCapacity(3); if (queryIndex < indices.size() && databaseIndex < shifts.size()) { result.add(new Node(queryIndex + 1, databaseIndex + 1)); } if (databaseIndex < shifts.size()) { result.add(new Node(queryIndex, databaseIndex + 1)); } if (queryIndex < indices.size()) { result.add(new Node(queryIndex + 1, databaseIndex)); } return result; } @Override public boolean equals(Object object) { if (!(object instanceof Node)) return false; Node other = (Node) object; return queryIndex == other.queryIndex && databaseIndex == other.databaseIndex; } @Override public int hashCode() { return 31 * (31 * queryIndex + databaseIndex); } @Override public String toString() { return String.format("[%d %d]", queryIndex, databaseIndex); } } private final List<Integer> shifts; private final List<String> query; private final List<Integer> indices; private final List<Node> alignment; public Alignment(List<String> query, Range<Integer> range) { this.query = query; indices = newArrayList(); Set<Integer> shiftSet = newTreeSet(); for (int i = 0; i < query.size(); ++i) { if (tupleIndex.containsKey(query.get(i))) indices.add(i); for (Integer shift : tupleIndex.get(query.get(i))) { if (range.contains(shift)) shiftSet.add(shift); } } shifts = newArrayList(shiftSet); Map<Node, Integer> cost = newHashMap(); Function<Node, Integer> priority = forMap(cost, Integer.MAX_VALUE); PriorityQueue<Node, Integer> openSet = newPriorityQueue(priority); Collection<Node> closedSet = newHashSet(); Map<Node, Node> parents = newHashMap(); Node startNode = new Node(0, 0); cost.put(startNode, 0); openSet.insert(startNode); while (!openSet.isEmpty()) { Node q = openSet.extractMin(); if (closedSet.contains(q)) continue; if (q.isTarget()) { List<Node> backtrace = newArrayList(); while (parents.containsKey(q)) { if (!q.isBoundary() && q.hasMatch()) backtrace.add(q); q = parents.get(q); } alignment = reverse(backtrace); return; } closedSet.add(q); for (Node nb : filter(q.adjacent(), not(in(closedSet)))) { // FIXME: move to appropriate location int l = abs(indices.size() - shifts.size() - q.queryIndex + q.databaseIndex) - abs(indices.size() - shifts.size() - nb.queryIndex + nb.databaseIndex); int oldScore = priority.apply(nb); int newScore = priority.apply(q) + nb.getValue() - l; if (newScore < oldScore) { cost.put(nb, newScore); openSet.insert(nb); parents.put(nb, q); } } } alignment = emptyList(); } public List<Node> getIndices() { return alignment; } } private final Joiner joiner; private final int tupleSize; private final List<String> reftup; private final Multimap<String, Integer> tupleIndex; private List<String> refWords; /** * Constructs new text aligner that servers requests for alignment of * sequence of words with the provided database sequence. Sequences are * aligned by tuples comprising one or more subsequent words. * * @param words list of words forming the database * @param tupleSize size of a tuple, must be greater or equal to 1 */ public LongTextAligner(List<String> words, int tupleSize) { checkNotNull(words, "word list must not be null"); checkArgument(tupleSize > 0, "tuple size must be greater than zero"); joiner = Joiner.on(' '); this.tupleSize = tupleSize; this.refWords = words; int offset = 0; reftup = getTuples(words); tupleIndex = ArrayListMultimap.create(); for (String tuple : reftup) tupleIndex.put(tuple, offset++); } /** * Aligns query sequence with the previously built database. * * @return indices of alignment */ public int[] align(List<String> query) { return align(query, Range.closed(0, refWords.size() - 1)); } /** * Aligns query sequence with the previously built database. * * @return indices of alignment */ public int[] align(List<String> words, Range<Integer> range) { if (range.upperEndpoint() - range.lowerEndpoint() < tupleSize || words.size() < tupleSize) { return new GlobalSequenceAligner<String>( refWords.subList(range.lowerEndpoint(), range.upperEndpoint() + 1), range.lowerEndpoint()) .align(words); } int[] result = new int[words.size()]; fill(result, -1); int lastIndex = 0; for (Alignment.Node node : new Alignment(getTuples(words), range).getIndices()) { // for (int j = 0; j < tupleSize; ++j) lastIndex = max(lastIndex, node.getQueryIndex()); for (; lastIndex < node.getQueryIndex() + tupleSize; ++lastIndex) result[lastIndex] = node.getDatabaseIndex() + lastIndex - node.getQueryIndex(); } return result; } /** * Makes list of tuples of the given size out of list of words. * * @param words words * @return list of tuples of size {@link #tupleSize} */ private List<String> getTuples(Iterable<String> words) { List<String> result = newArrayList(); LinkedList<String> tuple = newLinkedList(limit(words, tupleSize - 1)); for (String word : skip(words, tupleSize - 1)) { tuple.addLast(word); result.add(joiner.join(tuple)); tuple.removeFirst(); } return result; } }