List of usage examples for weka.core.Instances.attribute
public Attribute attribute(String name)
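Instances.attribute(String name) returns the weka.core.Attribute with the given name, or null if the dataset has no attribute by that name; the overload attribute(int index) retrieves an attribute by its 0-based position instead. Before the full examples below, here is a minimal, self-contained sketch of the name-based lookup (the file name data.arff and the attribute name petalwidth are placeholders, not taken from the examples):

import weka.core.Attribute;
import weka.core.Instances;
import weka.core.converters.ConverterUtils.DataSource;

public class AttributeLookupExample {

  public static void main(String[] args) throws Exception {
    // load a dataset (placeholder path)
    Instances data = DataSource.read("data.arff");

    // look up an attribute by name; returns null if no such attribute exists
    Attribute att = data.attribute("petalwidth");
    if (att != null)
      System.out.println("Found '" + att.name() + "' at 0-based index " + att.index());
    else
      System.out.println("No attribute named 'petalwidth' in " + data.relationName());
  }
}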
From source file: adams.flow.transformer.WekaInstancesMerge.java
License: Open Source License
/**
 * Excludes attributes from the data.
 *
 * @param inst	the data to process
 * @return	the processed data
 */
protected Instances excludeAttributes(Instances inst) {
  Instances result;
  StringBuilder atts;
  int i;
  Remove filter;

  // determine attribute indices
  atts = new StringBuilder();
  for (i = 0; i < inst.numAttributes(); i++) {
    if (inst.attribute(i).name().matches(m_ExcludedAttributes)) {
      if (atts.length() > 0)
        atts.append(",");
      atts.append((i + 1));
    }
  }

  // filter data
  try {
    filter = new Remove();
    filter.setAttributeIndices(atts.toString());
    filter.setInvertSelection(m_InvertMatchingSense);
    filter.setInputFormat(inst);
    result = weka.filters.Filter.useFilter(inst, filter);
  }
  catch (Exception e) {
    result = inst;
    handleException("Error filtering data:", e);
  }

  return result;
}
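Note that the index string handed to Remove.setAttributeIndices() is a 1-based Weka range, which is why the loop above appends (i + 1) for each attribute whose name matches the exclusion pattern.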
From source file: adams.flow.transformer.WekaInstancesMerge.java
License: Open Source License
/**
 * Prefixes the attributes.
 *
 * @param index	the index of the dataset
 * @param inst	the data to process
 * @return	the processed data
 */
protected Instances prefixAttributes(Instances inst, int index) {
  Instances result;
  String prefix;
  ArrayList<Attribute> atts;
  int i;

  prefix = createPrefix(inst, index);

  // header
  atts = new ArrayList<>();
  for (i = 0; i < inst.numAttributes(); i++)
    atts.add(inst.attribute(i).copy(prefix + inst.attribute(i).name()));

  // data
  result = new Instances(inst.relationName(), atts, inst.numInstances());
  result.setClassIndex(inst.classIndex());
  for (i = 0; i < inst.numInstances(); i++)
    result.add((Instance) inst.instance(i).copy());

  return result;
}
From source file: adams.flow.transformer.WekaInstancesMerge.java
License: Open Source License
/**
 * Prepares the data, prefixing attributes, removing columns, etc., before
 * merging it.
 *
 * @param inst	the data to process
 * @param index	the 0-based index of the dataset being processed
 * @return	the prepared data
 */
protected Instances prepareData(Instances inst, int index) {
  Instances result;

  result = inst;

  if (m_KeepOnlySingleUniqueID && !m_UniqueID.isEmpty() && (inst.attribute(m_UniqueID) != null)) {
    if (index > 0)
      m_UniqueIDAtts.add(createPrefix(inst, index) + m_UniqueID);
  }

  // exclude attributes
  if (m_ExcludedAttributes.length() > 0)
    result = excludeAttributes(result);

  // prefix
  if (m_UsePrefix)
    result = prefixAttributes(inst, index);

  return result;
}
From source file: adams.flow.transformer.WekaInstancesMerge.java
License: Open Source License
/**
 * Updates the IDs in the hashset with the ones stored in the ID attribute
 * of the provided dataset.
 *
 * @param instIndex	the dataset index
 * @param inst	the dataset to obtain the IDs from
 * @param ids	the hashset to store the IDs in
 */
protected void updateIDs(int instIndex, Instances inst, HashSet ids) {
  Attribute att;
  int i;
  boolean numeric;
  HashSet current;
  Object id;

  att = inst.attribute(m_UniqueID);
  if (att == null)
    throw new IllegalStateException("Attribute '" + m_UniqueID + "' not found in relation '"
        + inst.relationName() + "' (#" + (instIndex + 1) + ")!");

  // determine/check type
  if (m_AttType == -1) {
    if ((att.type() == Attribute.NUMERIC) || (att.type() == Attribute.STRING))
      m_AttType = att.type();
    else
      throw new IllegalStateException("Attribute '" + m_UniqueID + "' must be either NUMERIC or STRING (#"
          + (instIndex + 1) + ")!");
  }
  else {
    if (m_AttType != att.type())
      throw new IllegalStateException("Attribute '" + m_UniqueID
          + "' must have same attribute type in all the datasets (#" + (instIndex + 1) + ")!");
  }

  // get IDs
  numeric = m_AttType == Attribute.NUMERIC;
  current = new HashSet();
  for (i = 0; i < inst.numInstances(); i++) {
    if (numeric)
      id = inst.instance(i).value(att);
    else
      id = inst.instance(i).stringValue(att);
    if (m_Strict && current.contains(id))
      throw new IllegalStateException(
          "ID '" + id + "' is not unique in dataset #" + (instIndex + 1) + "!");
    current.add(id);
  }
  ids.addAll(current);
}
From source file: adams.flow.transformer.WekaInstancesMerge.java
License: Open Source License
/**
 * Merges the datasets based on the collected IDs.
 *
 * @param orig	the original datasets
 * @param inst	the processed datasets to merge into one
 * @param ids	the IDs for identifying the rows
 * @return	the merged dataset
 */
protected Instances merge(Instances[] orig, Instances[] inst, HashSet ids) {
  Instances result;
  ArrayList<Attribute> atts;
  int i;
  int n;
  int m;
  int index;
  String relation;
  List sortedIDs;
  Attribute att;
  int[] indexStart;
  double value;
  double[] values;
  HashMap<Integer, Integer> hashmap;
  HashSet<Instance> hs;

  // create header
  if (isLoggingEnabled())
    getLogger().info("Creating merged header...");
  atts = new ArrayList<>();
  relation = "";
  indexStart = new int[inst.length];
  for (i = 0; i < inst.length; i++) {
    indexStart[i] = atts.size();
    for (n = 0; n < inst[i].numAttributes(); n++)
      atts.add((Attribute) inst[i].attribute(n).copy());
    // assemble relation name
    if (i > 0)
      relation += "_";
    relation += inst[i].relationName();
  }
  result = new Instances(relation, atts, ids.size());

  // fill with missing values
  if (isLoggingEnabled())
    getLogger().info("Filling with missing values...");
  for (i = 0; i < ids.size(); i++) {
    if (isStopped())
      return null;
    // progress
    if (isLoggingEnabled() && ((i + 1) % 1000 == 0))
      getLogger().info("" + (i + 1));
    result.add(new DenseInstance(result.numAttributes()));
  }

  // sort IDs
  if (isLoggingEnabled())
    getLogger().info("Sorting indices...");
  sortedIDs = new ArrayList(ids);
  Collections.sort(sortedIDs);

  // generate rows
  hashmap = new HashMap<>();
  for (i = 0; i < inst.length; i++) {
    if (isStopped())
      return null;
    if (isLoggingEnabled())
      getLogger().info("Adding file #" + (i + 1));
    att = orig[i].attribute(m_UniqueID);
    for (n = 0; n < inst[i].numInstances(); n++) {
      // progress
      if (isLoggingEnabled() && ((n + 1) % 1000 == 0))
        getLogger().info("" + (n + 1));

      // determine index of row
      if (m_AttType == Attribute.NUMERIC)
        index = Collections.binarySearch(sortedIDs, inst[i].instance(n).value(att));
      else
        index = Collections.binarySearch(sortedIDs, inst[i].instance(n).stringValue(att));
      if (index < 0)
        throw new IllegalStateException(
            "Failed to determine index for row #" + (n + 1) + " of dataset #" + (i + 1) + "!");

      if (!hashmap.containsKey(index))
        hashmap.put(index, 0);
      hashmap.put(index, hashmap.get(index) + 1);

      // use internal representation for faster access
      values = result.instance(index).toDoubleArray();

      // add attribute values
      for (m = 0; m < inst[i].numAttributes(); m++) {
        // missing value?
        if (inst[i].instance(n).isMissing(m))
          continue;

        switch (inst[i].attribute(m).type()) {
        case Attribute.NUMERIC:
        case Attribute.DATE:
        case Attribute.NOMINAL:
          values[indexStart[i] + m] = inst[i].instance(n).value(m);
          break;

        case Attribute.STRING:
          value = result.attribute(indexStart[i] + m)
              .addStringValue(inst[i].instance(n).stringValue(m));
          values[indexStart[i] + m] = value;
          break;

        case Attribute.RELATIONAL:
          value = result.attribute(indexStart[i] + m)
              .addRelation(inst[i].instance(n).relationalValue(m));
          values[indexStart[i] + m] = value;
          break;

        default:
          throw new IllegalStateException("Unhandled attribute type: " + inst[i].attribute(m).type());
        }
      }

      // update row
      result.set(index, new DenseInstance(1.0, values));
    }
  }

  if (getRemove()) {
    hs = new HashSet<>();
    for (Integer x : hashmap.keySet()) {
      if (hashmap.get(x) != inst.length)
        hs.add(result.get(x));
    }
    result.removeAll(hs);
  }

  return result;
}
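This merge aligns rows by binary-searching each instance's unique-ID value in the sorted ID list, and the hashmap tracks how many datasets contributed values to each row; when getRemove() is set, any row that did not receive values from every dataset is removed at the end.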
From source file: adams.flow.transformer.WekaInstancesMerge.java
License: Open Source License
/**
 * Executes the flow item.
 *
 * @return	null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
  String result;
  String[] filesStr;
  File[] files;
  int i;
  Instances output;
  Instances[] orig;
  Instances[] inst;
  Instance[] rows;
  HashSet ids;
  int max;
  TIntList uniqueList;
  Remove remove;

  result = null;

  // get filenames
  files = null;
  orig = null;
  if (m_InputToken.getPayload() instanceof String[]) {
    filesStr = (String[]) m_InputToken.getPayload();
    files = new File[filesStr.length];
    for (i = 0; i < filesStr.length; i++)
      files[i] = new PlaceholderFile(filesStr[i]);
  }
  else if (m_InputToken.getPayload() instanceof File[]) {
    files = (File[]) m_InputToken.getPayload();
  }
  else if (m_InputToken.getPayload() instanceof Instance[]) {
    rows = (Instance[]) m_InputToken.getPayload();
    orig = new Instances[rows.length];
    for (i = 0; i < rows.length; i++) {
      orig[i] = new Instances(rows[i].dataset(), 1);
      orig[i].add((Instance) rows[i].copy());
    }
  }
  else if (m_InputToken.getPayload() instanceof Instances[]) {
    orig = (Instances[]) m_InputToken.getPayload();
  }
  else {
    throw new IllegalStateException("Unhandled input type: " + m_InputToken.getPayload().getClass());
  }

  try {
    output = null;

    // simple merge
    if (m_UniqueID.length() == 0) {
      if (files != null) {
        inst = new Instances[1];
        for (i = 0; i < files.length; i++) {
          if (isStopped())
            break;
          inst[0] = DataSource.read(files[i].getAbsolutePath());
          inst[0] = prepareData(inst[0], i);
          if (i == 0) {
            output = inst[0];
          }
          else {
            if (isLoggingEnabled())
              getLogger().info("Merging with file #" + (i + 1) + ": " + files[i]);
            output = Instances.mergeInstances(output, inst[0]);
          }
        }
      }
      else if (orig != null) {
        inst = new Instances[1];
        for (i = 0; i < orig.length; i++) {
          if (isStopped())
            break;
          inst[0] = prepareData(orig[i], i);
          if (i == 0) {
            output = inst[0];
          }
          else {
            if (isLoggingEnabled())
              getLogger().info("Merging with dataset #" + (i + 1) + ": " + orig[i].relationName());
            output = Instances.mergeInstances(output, inst[0]);
          }
        }
      }
    }
    // merge based on row IDs
    else {
      m_AttType = -1;
      max = 0;
      m_UniqueIDAtts = new ArrayList<>();
      if (files != null) {
        orig = new Instances[files.length];
        for (i = 0; i < files.length; i++) {
          if (isStopped())
            break;
          if (isLoggingEnabled())
            getLogger().info("Loading file #" + (i + 1) + ": " + files[i]);
          orig[i] = DataSource.read(files[i].getAbsolutePath());
          max = Math.max(max, orig[i].numInstances());
        }
      }
      else if (orig != null) {
        for (i = 0; i < orig.length; i++)
          max = Math.max(max, orig[i].numInstances());
      }
      inst = new Instances[orig.length];
      ids = new HashSet(max);
      for (i = 0; i < orig.length; i++) {
        if (isStopped())
          break;
        if (isLoggingEnabled())
          getLogger().info("Updating IDs #" + (i + 1));
        updateIDs(i, orig[i], ids);
        if (isLoggingEnabled())
          getLogger().info("Preparing dataset #" + (i + 1));
        inst[i] = prepareData(orig[i], i);
      }
      output = merge(orig, inst, ids);

      // remove unnecessary unique ID attributes
      if (m_KeepOnlySingleUniqueID) {
        uniqueList = new TIntArrayList();
        for (String att : m_UniqueIDAtts)
          uniqueList.add(output.attribute(att).index());
        if (uniqueList.size() > 0) {
          if (isLoggingEnabled())
            getLogger().info("Removing duplicate unique ID attributes: " + m_UniqueIDAtts);
          remove = new Remove();
          remove.setAttributeIndicesArray(uniqueList.toArray());
          remove.setInputFormat(output);
          output = Filter.useFilter(output, remove);
        }
      }
    }

    if (!isStopped()) {
      m_OutputToken = new Token(output);
      updateProvenance(m_OutputToken);
    }
  }
  catch (Exception e) {
    result = handleException("Failed to merge: ", e);
  }

  return result;
}
From source file: adams.flow.transformer.WekaInstancesStatistic.java
License: Open Source License
/**
 * Executes the flow item.
 *
 * @return	null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
  String result;
  SpreadSheet sheet;
  Instances data;
  int i;
  int n;
  Index index;
  AbstractArrayStatistic stat;

  result = null;

  try {
    sheet = null;
    data = (Instances) m_InputToken.getPayload();
    stat = m_Statistic.shallowCopy(true);
    for (i = 0; i < m_Locations.length; i++) {
      switch (m_DataType) {
      case ROW_BY_INDEX:
        index = new Index(m_Locations[i].stringValue());
        index.setMax(data.numInstances());
        stat.add(StatUtils.toNumberArray(data.instance(index.getIntIndex()).toDoubleArray()));
        break;

      case COLUMN_BY_INDEX:
        index = new WekaAttributeIndex(m_Locations[i].stringValue());
        ((WekaAttributeIndex) index).setData(data);
        stat.add(StatUtils.toNumberArray(data.attributeToDoubleArray(index.getIntIndex())));
        break;

      case COLUMN_BY_REGEXP:
        for (n = 0; n < data.numAttributes(); n++) {
          if (data.attribute(n).name().matches(m_Locations[i].stringValue())) {
            stat.add(StatUtils.toNumberArray(data.attributeToDoubleArray(n)));
            break;
          }
        }
        break;

      default:
        throw new IllegalStateException("Unhandled data type: " + m_DataType);
      }
    }
    sheet = stat.calculate().toSpreadSheet();
  }
  catch (Exception e) {
    result = handleException("Error generating the statistic: ", e);
    sheet = null;
  }

  if (sheet != null)
    m_OutputToken = new Token(sheet);

  return result;
}
From source file: adams.flow.transformer.WekaMultiLabelSplitter.java
License: Open Source License
/**
 * Returns the generated token.
 *
 * @return	the generated token
 */
@Override
public Token output() {
  Token result;
  int index;
  Remove remove;
  Reorder reorder;
  StringBuilder indices;
  int i;
  int newIndex;
  Instances processed;

  result = null;

  index = m_AttributesToProcess.remove(0);
  remove = new Remove();
  indices = new StringBuilder();
  for (i = 0; i < m_ClassAttributes.size(); i++) {
    if (m_ClassAttributes.get(i) == index)
      continue;
    if (indices.length() > 0)
      indices.append(",");
    indices.append("" + (m_ClassAttributes.get(i) + 1));
  }
  remove.setAttributeIndices(indices.toString());
  try {
    remove.setInputFormat(m_Dataset);
    processed = weka.filters.Filter.useFilter(m_Dataset, remove);
    if (m_UpdateRelationName)
      processed.setRelationName(m_Dataset.attribute(index).name());
    result = new Token(processed);
  }
  catch (Exception e) {
    processed = null;
    handleException(
        "Failed to process dataset with following filter setup:\n" + OptionUtils.getCommandLine(remove), e);
  }

  if (m_MakeClassLast && (processed != null)) {
    newIndex = processed.attribute(m_Dataset.attribute(index).name()).index();
    indices = new StringBuilder();
    for (i = 0; i < processed.numAttributes(); i++) {
      if (i == newIndex)
        continue;
      if (indices.length() > 0)
        indices.append(",");
      indices.append("" + (i + 1));
    }
    if (indices.length() > 0)
      indices.append(",");
    indices.append("" + (newIndex + 1));
    reorder = new Reorder();
    try {
      reorder.setAttributeIndices(indices.toString());
      reorder.setInputFormat(processed);
      processed = weka.filters.Filter.useFilter(processed, reorder);
      if (m_UpdateRelationName)
        processed.setRelationName(m_Dataset.attribute(index).name());
      result = new Token(processed);
    }
    catch (Exception e) {
      handleException(
          "Failed to process dataset with following filter setup:\n" + OptionUtils.getCommandLine(reorder),
          e);
    }
  }

  return result;
}
From source file: adams.flow.transformer.WekaRegexToRange.java
License: Open Source License
/**
 * Executes the flow item.
 *
 * @return	null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
  String result;
  String range;
  Instances inst;

  result = null;
  range = "";

  if (m_InputToken.getPayload() instanceof Instances)
    inst = (Instances) m_InputToken.getPayload();
  else
    inst = ((Instance) m_InputToken.getPayload()).dataset();

  int firstInRange = Integer.MIN_VALUE;
  int lastInRange = Integer.MIN_VALUE;
  int last = Integer.MIN_VALUE;

  for (int i = 0; i < inst.numAttributes(); i++) {
    if (match(inst.attribute(i).name())) {
      if (i == last + 1) {
        lastInRange = i;
      }
      else {
        if (firstInRange != Integer.MIN_VALUE) {
          if (!range.equals("")) {
            range += ",";
          }
          if (firstInRange - lastInRange == 0) {
            range += "" + (firstInRange + 1);
          }
          else {
            range += "" + (firstInRange + 1) + "-" + (lastInRange + 1);
          }
        }
        firstInRange = i;
        lastInRange = i;
      }
      last = i;
    }
  }

  if (!range.equals("")) {
    range += ",";
  }

  if (firstInRange < 0) {
    range = "";
  }
  else if (lastInRange < 0 || lastInRange == firstInRange) {
    range += "" + (firstInRange + 1);
  }
  else {
    range += "" + (firstInRange + 1) + "-" + (lastInRange + 1);
  }

  m_OutputToken = new Token(range);

  return result;
}
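The emitted token is a 1-based Weka range string (for example "1-3,7"), built by collapsing consecutive runs of attributes whose names match the configured regular expression into from-to pairs.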
From source file: adams.flow.transformer.WekaReorderAttributesToReference.java
License: Open Source License
/**
 * Executes the flow item.
 *
 * @return	null if everything is fine, otherwise error message
 */
@Override
protected String doExecute() {
  String result;
  Instances dataOld;
  Instance instOld;
  Instances dataNew;
  Instance instNew;
  Attribute att;
  int i;
  StringBuilder order;
  List<Add> adds;
  Add add;
  int index;
  StringBuilder labels;
  int n;
  List<Filter> filters;
  Reorder reorder;

  result = null;

  if (m_OnTheFly && (m_Reference == null)) {
    result = setUpReference();
    if (result != null)
      return result;
  }

  dataNew = null;
  instNew = null;

  // get input data
  if (m_InputToken.getPayload() instanceof Instance) {
    instOld = (Instance) m_InputToken.getPayload();
    dataOld = instOld.dataset();
  }
  else {
    instOld = null;
    dataOld = (Instances) m_InputToken.getPayload();
  }

  // do we need to initialize filter?
  if (m_InitializeOnce || (m_Reorder == null)) {
    // check incoming data
    if (!m_Lenient) {
      for (i = 0; i < m_Reference.numAttributes(); i++) {
        att = m_Reference.attribute(i);
        if (dataOld.attribute(att.name()) == null) {
          if (result == null)
            result = "Missing attribute(s) in incoming data: " + att.name();
          else
            result += ", " + att.name();
        }
      }
      if (result != null)
        getLogger().severe(result);
    }

    if (result == null) {
      try {
        // determine indices
        order = new StringBuilder();
        adds = new ArrayList<Add>();
        for (i = 0; i < m_Reference.numAttributes(); i++) {
          att = m_Reference.attribute(i);
          if (dataOld.attribute(att.name()) == null) {
            index = dataOld.numAttributes() + adds.size();
            add = new Add();
            add.setAttributeIndex("last");
            add.setAttributeName(att.name());
            add.setAttributeType(new SelectedTag(att.type(), Add.TAGS_TYPE));
            if (att.isNominal()) {
              labels = new StringBuilder();
              for (n = 0; n < att.numValues(); n++) {
                if (labels.length() > 0)
                  labels.append(",");
                labels.append(att.value(n));
              }
              add.setNominalLabels(labels.toString());
            }
            adds.add(add);
          }
          else {
            index = dataOld.attribute(att.name()).index();
          }
          if (order.length() > 0)
            order.append(",");
          order.append((index + 1));
        }

        // build reorder filter
        reorder = new Reorder();
        reorder.setAttributeIndices(order.toString());

        // build multifilter
        filters = new ArrayList<Filter>();
        filters.addAll(adds);
        filters.add(reorder);
        m_Reorder = new MultiFilter();
        m_Reorder.setFilters(filters.toArray(new Filter[filters.size()]));

        // initialize filter
        m_Reorder.setInputFormat(dataOld);
      }
      catch (Exception e) {
        result = handleException("Failed to initialize reorder filter!", e);
      }
    }
  }

  // reorder data
  if (result == null) {
    try {
      if (instOld != null) {
        m_Reorder.input(instOld);
        m_Reorder.batchFinished();
        instNew = m_Reorder.output();
        if (m_KeepRelationName)
          instNew.dataset().setRelationName(dataOld.relationName());
      }
      else {
        dataNew = Filter.useFilter(dataOld, m_Reorder);
        if (m_KeepRelationName)
          dataNew.setRelationName(dataOld.relationName());
      }
    }
    catch (Exception e) {
      result = handleException("Failed to reorder data!", e);
      instNew = null;
      dataNew = null;
    }
  }

  if (instNew != null)
    m_OutputToken = new Token(instNew);
  else if (dataNew != null)
    m_OutputToken = new Token(dataNew);

  return result;
}