package weka.filters.unsupervised.attribute;

import java.io.Serializable;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.Vector;
import weka.classifiers.lazy.kstar.KStarConstants;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.OptionHandler;
import weka.core.Range;
import weka.core.SparseInstance;
import weka.core.Stopwords;
import weka.core.Utils;
import weka.filters.Filter;
import weka.filters.UnsupervisedFilter;
import weka.gui.visualize.Plot2D;

/* loaded from: input_file:weka/filters/unsupervised/attribute/StringToWordVector.class */
public class StringToWordVector extends Filter implements UnsupervisedFilter, OptionHandler {
    /** Delimiter characters handed to StringTokenizer when alphabetic-only tokenizing is off. */
    private String delimiters;
    /** Attributes to tokenize; while null, determineSelectedRange() selects every string attribute. */
    protected Range m_SelectedRange;
    /** Maps each kept word to the Integer index of its attribute in the output format. */
    private TreeMap m_Dictionary;
    /** False until the first batch has been finished; reset by setInputFormat(). */
    private boolean m_FirstBatchDone;
    /** If true, word attributes hold occurrence counts instead of 1.0 for presence. */
    private boolean m_OutputCounts;
    /** Prefix prepended to every generated word attribute name. */
    private String m_Prefix;
    /** Indexed by output attribute index: number of first-batch instances containing that word. */
    private int[] docsCounts;
    /** Number of instances in the first batch; -1 until the dictionary has been determined. */
    private int numInstances;
    /** Mean Euclidean length of the first-batch word vectors; -1 until computed in batchFinished(). */
    private double avgDocLength;
    /** Number of words to attempt to keep (applied per class value when a class is set). */
    private int m_WordsToKeep;
    /** If true, word values are transformed to log(1 + value). */
    private boolean m_TFTransform;
    /** If true, word values of each instance are rescaled to the average document length. */
    private boolean m_normalizeDocLength;
    /** If true, word values are multiplied by the log of (numInstances / docsCounts[word]). */
    private boolean m_IDFTransform;
    /** If true, tokens are runs of A-Z/a-z letters and the delimiter string is ignored. */
    private boolean m_onlyAlphabeticTokens;
    /** If true, tokens are lower-cased before the dictionary lookup. */
    private boolean m_lowerCaseTokens;
    /** If true, tokens for which Stopwords.isStopword() holds are left out of the dictionary. */
    private boolean m_useStoplist;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:weka/filters/unsupervised/attribute/StringToWordVector$AlphabeticStringTokenizer.class */
    /**
     * Enumerates the maximal runs of ASCII letters (A-Z, a-z) in a string; every
     * other character acts as a separator.
     */
    public class AlphabeticStringTokenizer implements Enumeration {
        /** Characters of the string being tokenized. */
        private char[] str;
        /** Index of the next character that has not been consumed yet. */
        int currentPos = 0;
        /** Explicit reference to the enclosing filter; stored but not otherwise used here. */
        private final StringToWordVector this$0;

        /**
         * @param stringToWordVector the enclosing filter instance
         * @param str                the text to split into alphabetic tokens
         */
        public AlphabeticStringTokenizer(StringToWordVector stringToWordVector, String str) {
            this.this$0 = stringToWordVector;
            this.str = str.toCharArray();
        }

        /** Tells whether the character is one of A-Z or a-z. */
        private boolean isAsciiLetter(char c) {
            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        }

        /** Returns the first index at or after {@code from} holding a letter, or the length if none. */
        private int skipNonLetters(int from) {
            int pos = from;
            while (pos < this.str.length && !isAsciiLetter(this.str[pos])) {
                pos++;
            }
            return pos;
        }

        /**
         * Advances past any separators and reports whether a token follows.
         *
         * @return true if at least one more alphabetic token is available
         */
        @Override // java.util.Enumeration
        public boolean hasMoreElements() {
            this.currentPos = skipNonLetters(this.currentPos);
            return this.currentPos < this.str.length;
        }

        /**
         * Returns the next alphabetic token.
         *
         * @return the next run of letters as a String
         * @throws NoSuchElementException if only separators (or nothing) remain
         */
        @Override // java.util.Enumeration
        public Object nextElement() {
            // The previous skip condition required a char to be both < 'a' and > 'z',
            // so it never skipped anything; calling nextElement() while positioned on
            // a separator returned "" without advancing.
            int begin = skipNonLetters(this.currentPos);
            this.currentPos = begin;
            if (begin >= this.str.length) {
                throw new NoSuchElementException("no more tokens present");
            }
            int end = begin;
            while (end < this.str.length && isAsciiLetter(this.str[end])) {
                end++;
            }
            this.currentPos = end;
            return new String(this.str, begin, end - begin);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:weka/filters/unsupervised/attribute/StringToWordVector$Count.class */
    /**
     * Per-word tally used while the dictionary is being built.
     */
    public class Count implements Serializable {
        /** Occurrence counter; initialised from the constructor argument. */
        public int count;
        /** Document counter; starts at zero and is incremented by the dictionary builder. */
        public int docCount;
        /** Explicit reference to the enclosing filter; stored but not otherwise used here. */
        private final StringToWordVector this$0;

        /**
         * @param stringToWordVector the enclosing filter instance
         * @param i                  initial value of {@link #count}
         */
        public Count(StringToWordVector stringToWordVector, int i) {
            this.count = i;
            this.this$0 = stringToWordVector;
        }
    }

    /**
     * Describes the command-line options understood by this filter.
     *
     * @return an enumeration of {@link Option} objects, in the order C, D, R, P, W, T, I, N, A, L, S
     */
    @Override // weka.core.OptionHandler
    public Enumeration listOptions() {
        Option[] supported = {
            new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C"),
            new Option("\tString containing the set of delimiter characters\n\t(default: \" \\n\\t.,:'\\\"()?!\")", "D", 1, "-D <delimiter set>"),
            new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>"),
            new Option("\tSpecify a prefix for the created attribute names.\n\t(default: \"\")", "P", 1, "-P <attribute name prefix>"),
            new Option("\tSpecify approximate number of word fields to create.\n\tSurplus words will be discarded..\n\t(default: 1000)", "W", 1, "-W <number of words to keep>"),
            new Option("\tTransform the word frequencies into log(1+fij)\n\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T"),
            new Option("\tTransform each word frequency into:\n\tfij*log(num of Documents/num of  documents containing word i)\n\t  where fij if frequency of word i in  jth document(instance)", "I", 0, "-I"),
            new Option("\tNormalize word frequencies of each document(instance) to average length of documents.", "N", 0, "-N"),
            new Option("\tOnly form tokens from contiguous alphabetic sequences (The delimiter string is ignored if this is set).", "A", 0, "-A"),
            new Option("\tConvert all tokens to lowercase before adding to the dictionary.", "L", 0, "-L"),
            new Option("\tIgnore words that are in the stoplist.", "S", 0, "-S"),
        };
        Vector options = new Vector(3);
        for (int k = 0; k < supported.length; k++) {
            options.addElement(supported[k]);
        }
        return options.elements();
    }

    /**
     * Configures the filter from a command-line style option array.
     *
     * <p>Valued options (-D, -R, -P, -W) are applied only when a non-empty value is
     * supplied; the boolean flags (-C, -T, -I, -N, -L, -A, -S) are always set from
     * the array, so an absent flag switches the corresponding feature off.
     *
     * @param strArr the option array to parse
     * @throws Exception if an option cannot be parsed (e.g. a non-numeric -W value)
     */
    @Override // weka.core.OptionHandler
    public void setOptions(String[] strArr) throws Exception {
        String delimiterSet = Utils.getOption('D', strArr);
        if (delimiterSet.length() > 0) {
            setDelimiters(delimiterSet);
        }

        String rangeSpec = Utils.getOption('R', strArr);
        if (rangeSpec.length() > 0) {
            setSelectedRange(rangeSpec);
        }

        String namePrefix = Utils.getOption('P', strArr);
        if (namePrefix.length() > 0) {
            setAttributeNamePrefix(namePrefix);
        }

        String wordLimit = Utils.getOption('W', strArr);
        if (wordLimit.length() > 0) {
            setWordsToKeep(Integer.parseInt(wordLimit));
        }

        boolean outputCounts = Utils.getFlag('C', strArr);
        setOutputWordCounts(outputCounts);
        boolean tfTransform = Utils.getFlag('T', strArr);
        setTFTransform(tfTransform);
        boolean idfTransform = Utils.getFlag('I', strArr);
        setIDFTransform(idfTransform);
        boolean normalize = Utils.getFlag('N', strArr);
        setNormalizeDocLength(normalize);
        boolean lowerCase = Utils.getFlag('L', strArr);
        setLowerCaseTokens(lowerCase);
        boolean alphabeticOnly = Utils.getFlag('A', strArr);
        setOnlyAlphabeticTokens(alphabeticOnly);
        boolean useStoplist = Utils.getFlag('S', strArr);
        setUseStoplist(useStoplist);
    }

    /**
     * Returns the current configuration as a command-line style option array.
     *
     * <p>The array always has 16 slots (4 valued options with their values plus
     * 7 flags need at most 15); unused trailing slots are filled with "".
     *
     * @return the option array, never containing null
     */
    @Override // weka.core.OptionHandler
    public String[] getOptions() {
        String[] options = new String[16];
        int next = 0;

        options[next++] = "-D";
        options[next++] = getDelimiters();

        if (getSelectedRange() != null) {
            options[next++] = "-R";
            // The upper bound can only be refreshed once an input format is known;
            // previously this dereferenced a null input format.
            if (getInputFormat() != null) {
                this.m_SelectedRange.setUpper(getInputFormat().numAttributes() - 1);
            }
            options[next++] = getSelectedRange().getRanges();
        }

        if (!"".equals(getAttributeNamePrefix())) {
            options[next++] = "-P";
            options[next++] = getAttributeNamePrefix();
        }

        options[next++] = "-W";
        options[next++] = String.valueOf(getWordsToKeep());

        // Previously the index was advanced BEFORE "-C" was stored, which left a
        // null slot behind and let the next flag (or the padding) overwrite "-C".
        if (getOutputWordCounts()) {
            options[next++] = "-C";
        }
        if (getTFTransform()) {
            options[next++] = "-T";
        }
        if (getIDFTransform()) {
            options[next++] = "-I";
        }
        if (getNormalizeDocLength()) {
            options[next++] = "-N";
        }
        if (getLowerCaseTokens()) {
            options[next++] = "-L";
        }
        if (getOnlyAlphabeticTokens()) {
            options[next++] = "-A";
        }
        if (getUseStoplist()) {
            options[next++] = "-S";
        }

        while (next < options.length) {
            options[next++] = "";
        }
        return options;
    }

    /**
     * Creates a filter with the default settings: the standard delimiter set, no
     * explicit attribute range, an empty dictionary, boolean word presence output,
     * no attribute name prefix and 1000 words to keep.
     */
    public StringToWordVector() {
        this.delimiters = " \n\t.,:'\"()?!";
        this.m_SelectedRange = null;
        this.m_Dictionary = new TreeMap();
        this.m_FirstBatchDone = false;
        this.m_OutputCounts = false;
        this.m_Prefix = "";
        // -1 marks "not computed yet" for both statistics.
        this.numInstances = -1;
        this.avgDocLength = -1.0d;
        // Documented default of the -W option; written as a literal instead of
        // borrowing an unrelated GUI plotting constant that happens to share the value.
        this.m_WordsToKeep = 1000;
    }

    /**
     * Creates a filter with the default settings but a caller-chosen number of
     * words to keep.
     *
     * @param i the number of words to attempt to keep
     */
    public StringToWordVector(int i) {
        // Reuse the default initialisation instead of repeating it (the copy also
        // stored a default into m_WordsToKeep that was immediately overwritten).
        this();
        this.m_WordsToKeep = i;
    }

    /**
     * Registers the input format and marks the first batch as not yet processed.
     *
     * @param instances the structure of the incoming data
     * @return always false: the output format is only known after the first batch
     *         has been finished
     * @throws Exception if the superclass rejects the format
     */
    @Override // weka.filters.Filter
    public boolean setInputFormat(Instances instances) throws Exception {
        super.setInputFormat(instances);
        m_FirstBatchDone = false;
        boolean outputFormatReady = false;
        return outputFormatReady;
    }

    /**
     * Feeds one instance to the filter.
     *
     * <p>During the first batch the instance is only buffered; afterwards it is
     * converted immediately.
     *
     * @param instance the instance to process
     * @return true if the instance was converted and queued for output, false if
     *         it was buffered
     * @throws IllegalStateException if no input format has been set
     * @throws Exception if the conversion fails
     */
    @Override // weka.filters.Filter
    public boolean input(Instance instance) throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (this.m_NewBatch) {
            resetQueue();
            this.m_NewBatch = false;
        }
        if (!m_FirstBatchDone) {
            bufferInput(instance);
            return false;
        }
        convertInstance(instance);
        return true;
    }

    /**
     * Signals the end of a batch.
     *
     * <p>On the first batch the dictionary is built first. If document-length
     * normalization is enabled on the first batch, all buffered instances are
     * converted without normalization, each word vector is divided by its
     * Euclidean length while the average length is accumulated, and finally every
     * vector is multiplied by that average before being queued. Otherwise the
     * buffered instances are simply converted one by one.
     *
     * @return true if converted instances are waiting to be collected
     * @throws IllegalStateException if no input format has been set
     * @throws Exception if converting an instance fails
     */
    @Override // weka.filters.Filter
    public boolean batchFinished() throws Exception {
        if (getInputFormat() == null) {
            throw new IllegalStateException("No input instance format defined");
        }
        if (!m_FirstBatchDone) {
            determineDictionary();
        }

        boolean normalizeFirstBatch = m_normalizeDocLength && !m_FirstBatchDone;
        if (normalizeFirstBatch) {
            FastVector converted = new FastVector();
            int firstWordIndex = 0;
            Instances buffered = getInputFormat();
            avgDocLength = KStarConstants.FLOOR;

            for (int row = 0; row < buffered.numInstances(); row++) {
                firstWordIndex = convertInstancewoDocNorm(buffered.instance(row), converted);
            }

            // Pass 1: scale every word vector to unit length and sum up the lengths.
            for (int doc = 0; doc < converted.size(); doc++) {
                Instance current = (Instance) converted.elementAt(doc);
                double sumOfSquares = 0.0d;
                for (int pos = 0; pos < current.numValues(); pos++) {
                    if (current.index(pos) >= firstWordIndex) {
                        double value = current.valueSparse(pos);
                        sumOfSquares += value * value;
                    }
                }
                double length = Math.sqrt(sumOfSquares);
                avgDocLength += length;
                for (int pos = 0; pos < current.numValues(); pos++) {
                    if (current.index(pos) < firstWordIndex) {
                        continue;
                    }
                    double scaled = current.valueSparse(pos) / length;
                    current.setValueSparse(pos, scaled);
                    if (scaled == KStarConstants.FLOOR) {
                        System.err.println("setting value " + current.index(pos) + " to zero.");
                        // Stay on the same position for the next iteration.
                        pos--;
                    }
                }
            }
            avgDocLength /= buffered.numInstances();

            // Pass 2: stretch the unit vectors to the average length and queue them.
            for (int doc = 0; doc < converted.size(); doc++) {
                Instance current = (Instance) converted.elementAt(doc);
                for (int pos = 0; pos < current.numValues(); pos++) {
                    if (current.index(pos) < firstWordIndex) {
                        continue;
                    }
                    double scaled = current.valueSparse(pos) * avgDocLength;
                    current.setValueSparse(pos, scaled);
                    if (scaled == KStarConstants.FLOOR) {
                        System.err.println("setting value " + current.index(pos) + " to zero.");
                        // Stay on the same position for the next iteration.
                        pos--;
                    }
                }
                push(current);
            }
            flushInput();
        } else {
            for (int row = 0; row < getInputFormat().numInstances(); row++) {
                convertInstance(getInputFormat().instance(row));
            }
            flushInput();
        }

        this.m_NewBatch = true;
        m_FirstBatchDone = true;
        return numPendingOutput() != 0;
    }

    /**
     * Returns a short description of this filter.
     *
     * @return the description text
     */
    public String globalInfo() {
        String description = "Converts String attributes into a set of attributes representing word occurrence information from the text contained in the strings. The set of words (attributes) is determined by the first batch filtered (typically training data).";
        return description;
    }

    /**
     * @return true if word counts are output instead of boolean presence
     */
    public boolean getOutputWordCounts() {
        return m_OutputCounts;
    }

    /**
     * @param z true to output word counts, false to output word presence
     */
    public void setOutputWordCounts(boolean z) {
        m_OutputCounts = z;
    }

    /**
     * @return the descriptive text for the outputWordCounts property
     */
    public String outputWordCountsTipText() {
        String tip = "Output word counts rather than boolean 0 or 1(indicating presence or absence of a word).";
        return tip;
    }

    /**
     * @return the set of delimiter characters used for tokenizing
     */
    public String getDelimiters() {
        return delimiters;
    }

    /**
     * @param str the new set of delimiter characters
     */
    public void setDelimiters(String str) {
        delimiters = str;
    }

    /**
     * @return the descriptive text for the delimiters property
     */
    public String delimitersTipText() {
        String tip = "Set of delimiter characters to use in tokenizing (default: \" \\n\\t.,:'\\\"()?!\"). This option is ignored if onlyAlphabeticTokens option is set to true.";
        return tip;
    }

    /**
     * @return the range of attributes to tokenize, or null if none has been set
     */
    public Range getSelectedRange() {
        return m_SelectedRange;
    }

    /**
     * Replaces the selected attribute range with one built from the given
     * range specification.
     *
     * @param str the range specification
     */
    public void setSelectedRange(String str) {
        Range parsed = new Range(str);
        m_SelectedRange = parsed;
    }

    /**
     * @return the prefix prepended to the names of generated word attributes
     */
    public String getAttributeNamePrefix() {
        return m_Prefix;
    }

    /**
     * @param str the prefix to prepend to the names of generated word attributes
     */
    public void setAttributeNamePrefix(String str) {
        m_Prefix = str;
    }

    /**
     * @return the descriptive text for the attributeNamePrefix property
     */
    public String attributeNamePrefixTipText() {
        String tip = "Prefix for the created attribute names. (default: \"\")";
        return tip;
    }

    /**
     * @return the number of words the filter attempts to keep
     */
    public int getWordsToKeep() {
        return m_WordsToKeep;
    }

    /**
     * @param i the number of words the filter should attempt to keep
     */
    public void setWordsToKeep(int i) {
        m_WordsToKeep = i;
    }

    /**
     * @return the descriptive text for the wordsToKeep property
     */
    public String wordsToKeepTipText() {
        String tip = "The number of words (per class if there is a class attribute assigned) to attempt to keep.";
        return tip;
    }

    /**
     * @return true if word values are transformed into log(1 + value)
     */
    public boolean getTFTransform() {
        return m_TFTransform;
    }

    /**
     * @param z true to transform word values into log(1 + value)
     */
    public void setTFTransform(boolean z) {
        m_TFTransform = z;
    }

    /**
     * @return the descriptive text for the TFTransform property
     */
    public String TFTransformTipText() {
        String tip = "Sets whether if the word frequencies should be transformed into:\n    log(1+fij) \n       where fij is the frequency of word i in document (instance) j.";
        return tip;
    }

    /**
     * @return true if word values are weighted by inverse document frequency
     */
    public boolean getIDFTransform() {
        return m_IDFTransform;
    }

    /**
     * @param z true to weight word values by inverse document frequency
     */
    public void setIDFTransform(boolean z) {
        m_IDFTransform = z;
    }

    /**
     * @return the descriptive text for the IDFTransform property
     */
    public String IDFTransformTipText() {
        String tip = "Sets whether if the word frequencies in a document should be transformed into: \n   fij*log(num of Docs/num of Docs with word i) \n      where fij is the frequency of word i in document (instance) j.";
        return tip;
    }

    /**
     * @return true if word values are normalized per document
     */
    public boolean getNormalizeDocLength() {
        return m_normalizeDocLength;
    }

    /**
     * @param z true to normalize word values per document
     */
    public void setNormalizeDocLength(boolean z) {
        m_normalizeDocLength = z;
    }

    /**
     * @return the descriptive text for the normalizeDocLength property
     */
    public String normalizeDocLengthTipText() {
        String tip = "Sets whether if the word frequencies for a document (instance) should be normalized or not.";
        return tip;
    }

    /**
     * @return true if tokens are formed only from contiguous alphabetic sequences
     */
    public boolean getOnlyAlphabeticTokens() {
        return m_onlyAlphabeticTokens;
    }

    /**
     * @param z true to form tokens only from contiguous alphabetic sequences
     */
    public void setOnlyAlphabeticTokens(boolean z) {
        m_onlyAlphabeticTokens = z;
    }

    /**
     * @return the descriptive text for the onlyAlphabeticTokens property
     */
    public String onlyAlphabeticTokensTipText() {
        String tip = "Sets whether if the word tokens are to be formed only from contiguous alphabetic sequences (The delimiter string is ignored if this option is set to true).";
        return tip;
    }

    /**
     * @return true if tokens are converted to lower case
     */
    public boolean getLowerCaseTokens() {
        return m_lowerCaseTokens;
    }

    /**
     * @param z true to convert tokens to lower case
     */
    public void setLowerCaseTokens(boolean z) {
        m_lowerCaseTokens = z;
    }

    /**
     * @return the descriptive text for the lowerCaseTokens property
     */
    public String lowerCaseTokensTipText() {
        String tip = "If set then all the word tokens are converted to lower case before being added to the dictionary.";
        return tip;
    }

    /**
     * @return true if words on the stoplist are ignored
     */
    public boolean getUseStoplist() {
        return m_useStoplist;
    }

    /**
     * @param z true to ignore words on the stoplist
     */
    public void setUseStoplist(boolean z) {
        m_useStoplist = z;
    }

    /**
     * @return the descriptive text for the useStoplist property
     */
    public String useStoplistTipText() {
        String tip = "Ignores all the words that are on the stoplist, if set to true.";
        return tip;
    }

    /**
     * Sorts the given array in place into ascending order.
     *
     * <p>Replaces a hand-rolled shell sort whose inner loop had no exit (making the
     * final store unreachable) and whose 1-based indexing never moved element 0
     * into place.
     *
     * @param iArr the array to sort; an empty array is left untouched
     */
    private static void sortArray(int[] iArr) {
        Arrays.sort(iArr);
    }

    /**
     * Resolves {@code m_SelectedRange} against the current input format.
     *
     * <p>If no range has been set, one covering every attribute of type code 2 is
     * created first. The range is then narrowed so that it contains only
     * attributes that are both inside it and of type code 2.
     * NOTE(review): type code 2 is presumably Attribute.STRING — confirm.
     */
    private void determineSelectedRange() {
        Instances format = getInputFormat();

        if (m_SelectedRange == null) {
            StringBuffer allStrings = new StringBuffer();
            for (int att = 0; att < format.numAttributes(); att++) {
                if (format.attribute(att).type() == 2) {
                    // Range specifications are 1-based.
                    allStrings.append(att + 1).append(",");
                }
            }
            m_SelectedRange = new Range(allStrings.toString());
        }
        m_SelectedRange.setUpper(format.numAttributes() - 1);

        StringBuffer selectedStrings = new StringBuffer();
        for (int att = 0; att < format.numAttributes(); att++) {
            if (!m_SelectedRange.isInRange(att)) {
                continue;
            }
            if (format.attribute(att).type() != 2) {
                continue;
            }
            selectedStrings.append(att + 1).append(",");
        }
        m_SelectedRange.setRanges(selectedStrings.toString());
        m_SelectedRange.setUpper(format.numAttributes() - 1);
    }

    /**
     * Builds the word dictionary and the output format from the buffered
     * first-batch instances.
     *
     * <p>Steps: (1) count, per class value, how often each token occurs and in how
     * many instances; (2) derive a minimum count per class value so that roughly
     * {@code m_WordsToKeep} words survive; (3) create the output attributes: the
     * unselected input attributes first, followed by one numeric attribute per
     * surviving word; (4) record per-word document counts, the dictionary, the
     * number of instances and the output format.
     */
    private void determineDictionary() {
        Instances inputFormat = getInputFormat();
        int classIndex = inputFormat.classIndex();
        int numClasses = classIndex != -1 ? inputFormat.attribute(classIndex).numValues() : 1;

        @SuppressWarnings("unchecked") // generic array creation is not expressible directly
        TreeMap<String, Count>[] perClass = new TreeMap[numClasses];
        for (int c = 0; c < numClasses; c++) {
            perClass[c] = new TreeMap<String, Count>();
        }

        determineSelectedRange();

        // (1) Tokenize every selected attribute of every buffered instance.
        for (int row = 0; row < inputFormat.numInstances(); row++) {
            Instance instance = inputFormat.instance(row);
            int classValue = classIndex != -1 ? (int) instance.classValue() : 0;
            TreeMap<String, Count> classWords = perClass[classValue];
            // Distinct words of this instance, used to update document counts once.
            Hashtable<String, Integer> wordsInInstance = new Hashtable<String, Integer>();

            for (int att = 0; att < instance.numAttributes(); att++) {
                if (!this.m_SelectedRange.isInRange(att) || instance.isMissing(att)) {
                    continue;
                }
                Enumeration tokens = this.m_onlyAlphabeticTokens
                        ? new AlphabeticStringTokenizer(this, instance.stringValue(att))
                        : new StringTokenizer(instance.stringValue(att), this.delimiters);
                while (tokens.hasMoreElements()) {
                    String word = ((String) tokens.nextElement()).intern();
                    if (this.m_lowerCaseTokens) {
                        word = word.toLowerCase();
                    }
                    if (this.m_useStoplist && Stopwords.isStopword(word)) {
                        continue;
                    }
                    // containsKey, not contains: Hashtable.contains() looks at values.
                    if (!wordsInInstance.containsKey(word)) {
                        wordsInInstance.put(word, Integer.valueOf(0));
                    }
                    Count count = classWords.get(word);
                    if (count == null) {
                        classWords.put(word, new Count(this, 1));
                    } else {
                        count.count++;
                    }
                }
            }

            Enumeration<String> seen = wordsInInstance.keys();
            while (seen.hasMoreElements()) {
                Count count = classWords.get(seen.nextElement());
                if (count != null) {
                    count.docCount++;
                } else {
                    System.err.println("Warning: A word should definitely be in the dictionary.Please check the code");
                }
            }
        }

        // (2) Per class value, find the smallest count a word needs to be kept.
        int totalWords = 0;
        int[] minCount = new int[numClasses];
        for (int c = 0; c < numClasses; c++) {
            totalWords += perClass[c].size();
            int[] counts = new int[perClass[c].size()];
            int pos = 0;
            for (Count count : perClass[c].values()) {
                counts[pos++] = count.count;
            }
            sortArray(counts);
            if (counts.length < this.m_WordsToKeep) {
                minCount[c] = 1;
            } else {
                minCount[c] = Math.max(1, counts[counts.length - this.m_WordsToKeep]);
            }
        }

        // (3) Unselected input attributes come first, then one attribute per word.
        FastVector attributes = new FastVector(totalWords + inputFormat.numAttributes());
        int outputClassIndex = -1;
        for (int att = 0; att < inputFormat.numAttributes(); att++) {
            if (!this.m_SelectedRange.isInRange(att)) {
                if (inputFormat.classIndex() == att) {
                    outputClassIndex = attributes.size();
                }
                attributes.addElement(inputFormat.attribute(att).copy());
            }
        }

        TreeMap<String, Integer> dictionary = new TreeMap<String, Integer>();
        int nextIndex = attributes.size();
        for (int c = 0; c < numClasses; c++) {
            for (String word : perClass[c].keySet()) {
                if (perClass[c].get(word).count >= minCount[c] && !dictionary.containsKey(word)) {
                    dictionary.put(word, Integer.valueOf(nextIndex++));
                    attributes.addElement(new Attribute(this.m_Prefix + word));
                }
            }
        }

        // (4) Document counts are summed over all class values.
        this.docsCounts = new int[attributes.size()];
        for (String word : dictionary.keySet()) {
            int documents = 0;
            for (int c = 0; c < numClasses; c++) {
                Count count = perClass[c].get(word);
                if (count != null) {
                    documents += count.docCount;
                }
            }
            this.docsCounts[dictionary.get(word).intValue()] = documents;
        }

        attributes.trimToSize();
        this.m_Dictionary = dictionary;
        this.numInstances = inputFormat.numInstances();
        Instances outputFormat = new Instances(inputFormat.relationName(), attributes, 0);
        outputFormat.setClassIndex(outputClassIndex);
        setOutputFormat(outputFormat);
    }

    /**
     * Converts an input instance into a sparse word-vector instance, applying
     * document-length normalization if enabled, and pushes it onto the output queue.
     *
     * @param instance the instance to convert
     * @throws Exception if normalization is enabled but the average document
     *                   length has not been computed yet
     */
    private void convertInstance(Instance instance) throws Exception {
        TreeMap<Integer, Double> contained = new TreeMap<Integer, Double>();
        int firstWordIndex = collectValues(instance, contained);

        if (this.m_normalizeDocLength) {
            if (this.avgDocLength < 0.0d) {
                throw new Exception("Error. Average Doc Length not defined yet.");
            }
            double sumOfSquares = 0.0d;
            for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
                if (entry.getKey().intValue() >= firstWordIndex) {
                    double value = entry.getValue().doubleValue();
                    sumOfSquares += value * value;
                }
            }
            double length = Math.sqrt(sumOfSquares);
            for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
                if (entry.getKey().intValue() >= firstWordIndex) {
                    double value = entry.getValue().doubleValue();
                    entry.setValue(Double.valueOf(value / length * this.avgDocLength));
                }
            }
        }

        push(toSparseInstance(instance, contained));
    }

    /**
     * Converts an input instance into a sparse word-vector instance WITHOUT
     * document-length normalization and appends it to the given vector.
     *
     * @param instance   the instance to convert
     * @param fastVector receives the converted instance
     * @return the output index of the first word attribute (i.e. the number of
     *         unselected input attributes copied to the front)
     */
    private int convertInstancewoDocNorm(Instance instance, FastVector fastVector) {
        TreeMap<Integer, Double> contained = new TreeMap<Integer, Double>();
        int firstWordIndex = collectValues(instance, contained);
        fastVector.addElement(toSparseInstance(instance, contained));
        return firstWordIndex;
    }

    /**
     * Fills {@code contained} (output attribute index to value) for one instance:
     * copies the unselected attributes, counts dictionary words in the selected
     * ones and applies the TF and IDF transforms if enabled.
     *
     * @return the output index of the first word attribute
     */
    private int collectValues(Instance instance, TreeMap<Integer, Double> contained) {
        // Unselected attributes keep their relative order at the front of the output.
        int firstWordIndex = 0;
        for (int att = 0; att < getInputFormat().numAttributes(); att++) {
            if (this.m_SelectedRange.isInRange(att)) {
                continue;
            }
            Integer outIndex = Integer.valueOf(firstWordIndex);
            if (getInputFormat().attribute(att).type() != Attribute.STRING) {
                // Zeros are implicit in a sparse instance.
                if (instance.value(att) != 0.0d) {
                    contained.put(outIndex, Double.valueOf(instance.value(att)));
                }
            } else if (instance.isMissing(att)) {
                contained.put(outIndex, Double.valueOf(Instance.missingValue()));
            } else {
                Attribute outAttribute = outputFormatPeek().attribute(firstWordIndex);
                // A dummy first value keeps real strings away from index 0, which a
                // sparse instance could not distinguish from "not stored".
                if (outAttribute.numValues() == 0) {
                    outAttribute.addStringValue("Hack to defeat SparseInstance bug");
                }
                int valueIndex = outAttribute.addStringValue(instance.stringValue(att));
                contained.put(outIndex, Double.valueOf((double) valueIndex));
            }
            firstWordIndex++;
        }

        // Word presence or counts for every dictionary word found in the text.
        for (int att = 0; att < instance.numAttributes(); att++) {
            if (!this.m_SelectedRange.isInRange(att) || instance.isMissing(att)) {
                continue;
            }
            Enumeration tokens = this.m_onlyAlphabeticTokens
                    ? new AlphabeticStringTokenizer(this, instance.stringValue(att))
                    : new StringTokenizer(instance.stringValue(att), this.delimiters);
            while (tokens.hasMoreElements()) {
                String word = (String) tokens.nextElement();
                if (this.m_lowerCaseTokens) {
                    word = word.toLowerCase();
                }
                Integer wordIndex = (Integer) this.m_Dictionary.get(word);
                if (wordIndex == null) {
                    continue;
                }
                Double current = contained.get(wordIndex);
                if (this.m_OutputCounts && current != null) {
                    contained.put(wordIndex, Double.valueOf(current.doubleValue() + 1.0d));
                } else {
                    contained.put(wordIndex, Double.valueOf(1.0d));
                }
            }
        }

        if (this.m_TFTransform) {
            for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
                if (entry.getKey().intValue() >= firstWordIndex) {
                    entry.setValue(Double.valueOf(Math.log(entry.getValue().doubleValue() + 1.0d)));
                }
            }
        }

        if (this.m_IDFTransform) {
            for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
                int wordIndex = entry.getKey().intValue();
                if (wordIndex >= firstWordIndex) {
                    // Floating-point ratio: the former int/int division truncated
                    // N/df (e.g. 10/4 became 2) before the logarithm was taken.
                    double idf = Math.log(this.numInstances / (double) this.docsCounts[wordIndex]);
                    entry.setValue(Double.valueOf(entry.getValue().doubleValue() * idf));
                }
            }
        }
        return firstWordIndex;
    }

    /**
     * Builds a sparse instance in the output format from an index-to-value map,
     * keeping the weight of the source instance.
     */
    private SparseInstance toSparseInstance(Instance source, TreeMap<Integer, Double> contained) {
        double[] values = new double[contained.size()];
        int[] indices = new int[contained.size()];
        int pos = 0;
        // TreeMap iteration yields the indices in ascending order.
        for (Map.Entry<Integer, Double> entry : contained.entrySet()) {
            indices[pos] = entry.getKey().intValue();
            values[pos] = entry.getValue().doubleValue();
            pos++;
        }
        SparseInstance result =
                new SparseInstance(source.weight(), values, indices, outputFormatPeek().numAttributes());
        result.setDataset(outputFormatPeek());
        return result;
    }

    /**
     * Command-line entry point: runs the filter in batch mode when the -b flag is
     * present and in single-file mode otherwise. Any exception is printed with
     * its stack trace followed by its message.
     *
     * @param strArr the command-line arguments
     */
    public static void main(String[] strArr) {
        try {
            boolean batchMode = Utils.getFlag('b', strArr);
            if (!batchMode) {
                Filter.filterFile(new StringToWordVector(), strArr);
            } else {
                Filter.batchFilterFile(new StringToWordVector(), strArr);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println(e.getMessage());
        }
    }
}
