/*
 * Carrot2 project.
 *
 * Copyright (C) 2002-2010, Dawid Weiss, Stanisław Osiński.
 * All rights reserved.
 *
 * Refer to the full license file "carrot2.LICENSE"
 * in the root folder of the repository checkout or at:
 * http://www.carrot2.org/carrot2.LICENSE
 */

package org.carrot2.text.vsm;

import org.apache.commons.lang.ArrayUtils;
import org.apache.mahout.math.GenericPermuting;
import org.apache.mahout.math.matrix.DoubleFactory2D;
import org.apache.mahout.math.matrix.*;
import org.apache.mahout.math.matrix.impl.SparseDoubleMatrix2D;
import org.carrot2.core.Document;
import org.carrot2.core.attribute.Internal;
import org.carrot2.core.attribute.Processing;
import org.carrot2.matrix.MatrixUtils;
import org.carrot2.matrix.NNIDoubleFactory2D;
import org.carrot2.text.analysis.TokenTypeUtils;
import org.carrot2.text.preprocessing.PreprocessingContext;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Required;
import org.carrot2.util.attribute.constraint.DoubleRange;
import org.carrot2.util.attribute.constraint.ImplementingClasses;
import org.carrot2.util.attribute.constraint.IntRange;

import com.carrotsearch.hppc.BitSet;
import com.carrotsearch.hppc.IntIntOpenHashMap;
import com.carrotsearch.hppc.sorting.IndirectComparator;
import com.carrotsearch.hppc.sorting.IndirectSort;

/**
 * Builds a term-document matrix based on the provided {@link PreprocessingContext}.
 */
@SuppressWarnings("deprecation")
@Bindable(prefix = "TermDocumentMatrixBuilder")
public class TermDocumentMatrixBuilder
{
    /**
     * Title word boost. Gives more weight to words that appeared in
     * {@link org.carrot2.core.Document#TITLE} fields.
     *
     * @level Medium
     * @group Labels
     * @label Title word boost
     */
    @Input
    @Processing
    @Attribute
    @DoubleRange(min = 0, max = 10)
    public double titleWordsBoost = 2.0;

    /**
     * Maximum matrix size. The maximum number of elements in the term-document
     * matrix. The larger the matrix, the more accurate the clustering, but also
     * the more time- and memory-consuming.
     *
     * @level Medium
     * @group Matrix model
     * @label Maximum matrix size
     */
    @Input
    @Processing
    @Attribute
    @IntRange(min = 50 * 100)
    @Internal(configuration = true)
    public int maximumMatrixSize = 250 * 150;
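    // For illustration: buildTermDocumentMatrix() below divides maximumMatrixSize
    // by the number of documents to cap the number of matrix rows. With the
    // default of 250 * 150 = 37500 elements and, say, 100 input documents, at
    // most 37500 / 100 = 375 of the top-weighted stems become matrix rows.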
    /**
     * Maximum word document frequency. The maximum document frequency allowed for
     * words, as a fraction of all documents. Words with a document frequency larger
     * than <code>maxWordDf</code> will be ignored. For example, when
     * <code>maxWordDf</code> is <code>0.4</code>, words appearing in more than 40%
     * of documents will be ignored. A value of <code>1.0</code> means that all
     * words will be taken into account, no matter in how many documents they appear.
     * <p>
     * This attribute may be useful when certain words appear in most of the input
     * documents (e.g. a company name from a header or footer) and such words dominate
     * the cluster labels. In such cases, setting <code>maxWordDf</code> to a value
     * lower than <code>1.0</code>, e.g. <code>0.9</code>, may improve the clusters.
     * </p>
     * <p>
     * Another useful application of this attribute is when there is a need to
     * generate only very specific clusters, i.e. clusters containing small numbers
     * of documents. This can be achieved by setting <code>maxWordDf</code> to
     * extremely low values, e.g. <code>0.1</code> or <code>0.05</code>.
     * </p>
     *
     * @level Advanced
     * @group Matrix model
     * @label Maximum word document frequency
     */
    @Input
    @Processing
    @Attribute
    @DoubleRange(min = 0.00, max = 1.0)
    public double maxWordDf = 0.9;

    /**
     * Term weighting. The method for calculating weights of words in the
     * term-document matrices.
     *
     * @level Advanced
     * @group Matrix model
     * @label Term weighting
     */
    @Input
    @Processing
    @Attribute
    @Required
    @ImplementingClasses(classes =
    {
        LogTfIdfTermWeighting.class, LinearTfIdfTermWeighting.class,
        TfTermWeighting.class
    }, strict = false)
    public ITermWeighting termWeighting = new LogTfIdfTermWeighting();

    /**
     * Builds a term-document matrix from data provided in the <code>context</code>
     * and stores the result there.
     */
    public void buildTermDocumentMatrix(VectorSpaceModelContext vsmContext)
    {
        final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
        final int documentCount = preprocessingContext.documents.size();
        final int[] stemsTf = preprocessingContext.allStems.tf;
        final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
        final byte[] stemsFieldIndices = preprocessingContext.allStems.fieldIndices;

        if (documentCount == 0)
        {
            vsmContext.termDocumentMatrix = NNIDoubleFactory2D.nni.make(0, 0);
            vsmContext.stemToRowIndex = new IntIntOpenHashMap();
            return;
        }

        // Determine the index of the title field
        int titleFieldIndex = -1;
        final String[] fieldsName = preprocessingContext.allFields.name;
        for (int i = 0; i < fieldsName.length; i++)
        {
            if (Document.TITLE.equals(fieldsName[i]))
            {
                titleFieldIndex = i;
                break;
            }
        }

        // Determine the stems we, ideally, should include in the matrix
        int[] stemsToInclude = computeRequiredStemIndices(preprocessingContext);

        // Sort stems by weight, so that stems get included in the matrix in the
        // order of frequency
        final double[] stemsWeight = new double[stemsToInclude.length];
        for (int i = 0; i < stemsToInclude.length; i++)
        {
            final int stemIndex = stemsToInclude[i];
            stemsWeight[i] = termWeighting.calculateTermWeight(stemsTf[stemIndex],
                stemsTfByDocument[stemIndex].length / 2, documentCount)
                * getWeightBoost(titleFieldIndex, stemsFieldIndices[stemIndex]);
        }
        final int[] stemWeightOrder = IndirectSort.sort(0, stemsWeight.length,
            new IndirectComparator.DescendingDoubleComparator(stemsWeight));

        // Calculate the number of terms we can include to fulfill the max matrix size
        final int maxRows = maximumMatrixSize / documentCount;
        final DoubleMatrix2D tdMatrix = NNIDoubleFactory2D.nni.make(
            Math.min(maxRows, stemsToInclude.length), documentCount);

        for (int i = 0; i < stemWeightOrder.length && i < maxRows; i++)
        {
            final int stemIndex = stemsToInclude[stemWeightOrder[i]];
            final int[] tfByDocument = stemsTfByDocument[stemIndex];
            final int df = tfByDocument.length / 2;
            final byte fieldIndices = stemsFieldIndices[stemIndex];

            int tfByDocumentIndex = 0;
            for (int documentIndex = 0; documentIndex < documentCount; documentIndex++)
            {
                if (tfByDocumentIndex * 2 < tfByDocument.length
                    && tfByDocument[tfByDocumentIndex * 2] == documentIndex)
                {
                    double weight = termWeighting.calculateTermWeight(
                        tfByDocument[tfByDocumentIndex * 2 + 1], df, documentCount);

                    weight *= getWeightBoost(titleFieldIndex, fieldIndices);
                    tfByDocumentIndex++;

                    tdMatrix.set(i, documentIndex, weight);
                }
            }
        }

        // Convert stemsToInclude into tdMatrixStemIndices
        GenericPermuting.permute(stemsToInclude, stemWeightOrder);
        stemsToInclude = ArrayUtils.subarray(stemsToInclude, 0, tdMatrix.rows());

        final IntIntOpenHashMap stemToRowIndex = new IntIntOpenHashMap();
        for (int i = 0; i < stemsToInclude.length; i++)
        {
            stemToRowIndex.put(stemsToInclude[i], i);
        }

        // Store the results
        vsmContext.termDocumentMatrix = tdMatrix;
        vsmContext.stemToRowIndex = stemToRowIndex;
    }
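    // Note on the data layout used above: each row of stemsTfByDocument is a
    // flattened array of [documentIndex, tf, documentIndex, tf, ...] pairs. This
    // is why a stem's document frequency is tfByDocument.length / 2, and why the
    // loop reads the document index at position 2 * k and the corresponding term
    // frequency at 2 * k + 1.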
    /**
     * Builds a term-phrase matrix in the same space as the main term-document
     * matrix. If the processing context contains no phrases,
     * {@link VectorSpaceModelContext#termPhraseMatrix} will remain <code>null</code>.
     */
    public void buildTermPhraseMatrix(VectorSpaceModelContext context)
    {
        final PreprocessingContext preprocessingContext = context.preprocessingContext;
        final IntIntOpenHashMap stemToRowIndex = context.stemToRowIndex;
        final int[] labelsFeatureIndex = preprocessingContext.allLabels.featureIndex;
        final int firstPhraseIndex = preprocessingContext.allLabels.firstPhraseIndex;

        if (firstPhraseIndex >= 0 && stemToRowIndex.size() > 0)
        {
            // Build phrase matrix
            int[] phraseFeatureIndices = new int[labelsFeatureIndex.length - firstPhraseIndex];
            for (int featureIndex = 0; featureIndex < phraseFeatureIndices.length; featureIndex++)
            {
                phraseFeatureIndices[featureIndex] =
                    labelsFeatureIndex[featureIndex + firstPhraseIndex];
            }

            final DoubleMatrix2D phraseMatrix = TermDocumentMatrixBuilder
                .buildAlignedMatrix(context, phraseFeatureIndices, termWeighting);
            MatrixUtils.normalizeColumnL2(phraseMatrix, null);
            context.termPhraseMatrix = phraseMatrix.viewDice();
        }
    }

    /**
     * Calculates the boost we should apply to a stem, based on the field indices array.
     */
    private double getWeightBoost(int titleFieldIndex, final byte fieldIndices)
    {
        if ((fieldIndices & (1 << titleFieldIndex)) != 0)
        {
            return titleWordsBoost;
        }
        return 1;
    }

    /**
     * Computes stem indices of words that are one-word label candidates or are
     * non-stop words from phrase label candidates.
     */
    private int[] computeRequiredStemIndices(PreprocessingContext context)
    {
        final int[] labelsFeatureIndex = context.allLabels.featureIndex;
        final int[] wordsStemIndex = context.allWords.stemIndex;
        final short[] wordsTypes = context.allWords.type;
        final int[][] phrasesWordIndices = context.allPhrases.wordIndices;
        final int wordCount = wordsStemIndex.length;
        final int[][] stemsTfByDocument = context.allStems.tfByDocument;
        final int documentCount = context.documents.size();

        final BitSet requiredStemIndices = new BitSet(labelsFeatureIndex.length);
        for (int i = 0; i < labelsFeatureIndex.length; i++)
        {
            final int featureIndex = labelsFeatureIndex[i];
            if (featureIndex < wordCount)
            {
                addStemIndex(wordsStemIndex, documentCount, stemsTfByDocument,
                    requiredStemIndices, featureIndex);
            }
            else
            {
                final int[] wordIndices = phrasesWordIndices[featureIndex - wordCount];
                for (int j = 0; j < wordIndices.length; j++)
                {
                    final int wordIndex = wordIndices[j];
                    if (!TokenTypeUtils.isCommon(wordsTypes[wordIndex]))
                    {
                        addStemIndex(wordsStemIndex, documentCount, stemsTfByDocument,
                            requiredStemIndices, wordIndex);
                    }
                }
            }
        }

        return requiredStemIndices.asIntLookupContainer().toArray();
    }

    /**
     * Adds a stem index to the set, with a check on the stem's document frequency.
     */
    private void addStemIndex(final int[] wordsStemIndex, int documentCount,
        int[][] stemsTfByDocument, final BitSet requiredStemIndices, final int featureIndex)
    {
        final int stemIndex = wordsStemIndex[featureIndex];
        final int df = stemsTfByDocument[stemIndex].length / 2;
        if (((double) df / documentCount) <= maxWordDf)
        {
            requiredStemIndices.set(stemIndex);
        }
    }
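    // Worked example for the df check above: with documentCount = 200 and the
    // default maxWordDf = 0.9, addStemIndex() skips any stem whose document
    // frequency exceeds 0.9 * 200 = 180 documents.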
    /**
     * Builds a sparse term-document-like matrix for the provided
     * <code>featureIndex</code> in the same term space as the original
     * term-document matrix.
     */
    static DoubleMatrix2D buildAlignedMatrix(VectorSpaceModelContext vsmContext,
        int[] featureIndex, ITermWeighting termWeighting)
    {
        final IntIntOpenHashMap stemToRowIndex = vsmContext.stemToRowIndex;
        if (featureIndex.length == 0)
        {
            return DoubleFactory2D.dense.make(stemToRowIndex.size(), 0);
        }

        final DoubleMatrix2D phraseMatrix = new SparseDoubleMatrix2D(
            stemToRowIndex.size(), featureIndex.length);

        final PreprocessingContext preprocessingContext = vsmContext.preprocessingContext;
        final int[] wordsStemIndex = preprocessingContext.allWords.stemIndex;
        final int[] stemsTf = preprocessingContext.allStems.tf;
        final int[][] stemsTfByDocument = preprocessingContext.allStems.tfByDocument;
        final int[][] phrasesWordIndices = preprocessingContext.allPhrases.wordIndices;
        final int documentCount = preprocessingContext.documents.size();
        final int wordCount = wordsStemIndex.length;

        for (int i = 0; i < featureIndex.length; i++)
        {
            final int feature = featureIndex[i];
            final int[] wordIndices;
            if (feature < wordCount)
            {
                wordIndices = new int[] { feature };
            }
            else
            {
                wordIndices = phrasesWordIndices[feature - wordCount];
            }

            for (int wordIndex = 0; wordIndex < wordIndices.length; wordIndex++)
            {
                final int stemIndex = wordsStemIndex[wordIndices[wordIndex]];
                if (stemToRowIndex.containsKey(stemIndex))
                {
                    final int rowIndex = stemToRowIndex.lget();
                    final double weight = termWeighting.calculateTermWeight(
                        stemsTf[stemIndex], stemsTfByDocument[stemIndex].length / 2,
                        documentCount);

                    phraseMatrix.setQuick(rowIndex, i, weight);
                }
            }
        }

        return phraseMatrix;
    }
}
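/*
 * Hypothetical usage sketch (for illustration only; not part of the original
 * Carrot2 source). It assumes a VectorSpaceModelContext has already been created
 * around a fully preprocessed PreprocessingContext; how that context is
 * constructed is version-specific and not shown here.
 *
 *   TermDocumentMatrixBuilder builder = new TermDocumentMatrixBuilder();
 *   builder.titleWordsBoost = 2.0; // double the weight of title words
 *   builder.maxWordDf = 0.9;       // ignore words in more than 90% of documents
 *   builder.buildTermDocumentMatrix(vsmContext);
 *   builder.buildTermPhraseMatrix(vsmContext);
 *   DoubleMatrix2D tdMatrix = vsmContext.termDocumentMatrix;
 */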