package de.uni_mannheim.informatik.dws.winter.webtables;

import de.uni_mannheim.informatik.dws.winter.webtables.parsers.StringNormalizer;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterFilterFactory;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
import org.apache.lucene.util.Version;

/* loaded from: input_file:de/uni_mannheim/informatik/dws/winter/webtables/WebTablesStringNormalizer.class */
/**
 * Static helpers that normalise web-table header and cell strings and tokenise
 * free text with a Lucene filter chain.
 */
public class WebTablesStringNormalizer {
    /**
     * Marker returned in place of strings that are recognised as "null";
     * initialised from {@link StringNormalizer#nullValue}.
     */
    public static String nullValue = StringNormalizer.nullValue;

    /** Publicly accessible, mutable list; it is not read by any method of this class. */
    public static List<String> stopWords = new ArrayList<String>();

    /** Normalised strings that are replaced by {@link #nullValue}. */
    private static final Set<String> possibleNullValues = Collections.unmodifiableSet(
            new HashSet<String>(Arrays.asList(
                    "",
                    "__",
                    "-",
                    "_",
                    "?",
                    "unknown",
                    "- -",
                    "n/a",
                    "•",
                    "- - -",
                    ".",
                    "??",
                    "(n/a)")));

    /** Greedy match from the first '(' to the last ')' on a line. */
    private static final Pattern bracketsPattern = Pattern.compile("\\(.*\\)");

    /**
     * Greedy match from the first '&lt;' to the last '&gt;' on a line.
     * NOTE(review): being greedy, this also removes any text between two tags;
     * kept as-is to preserve existing output -- confirm whether that is intended.
     */
    private static final Pattern markupPattern = Pattern.compile("<.*>");

    /** Numeric character references such as {@code &#160;} or {@code ?#160;}. */
    private static final Pattern numericEntityPattern = Pattern.compile("[&\\?]#[0-9]{1,3};");

    /** Literal round brackets, removed before tokenising. */
    private static final Pattern parenthesesPattern = Pattern.compile("[\\(\\)]");

    /**
     * Normalises a column header: unescapes Java escape sequences, removes
     * quotes, commas, braces, line feeds, dots and dollar signs, replaces pipes
     * and non-breaking-space markers with blanks, removes markup, lower-cases
     * and trims.
     *
     * @param str the raw header; may be {@code null}
     * @return the normalised header, or {@link #nullValue} if {@code str} is
     *         {@code null} or normalises to one of the known null markers
     */
    public static String normaliseHeader(String str) {
        if (str == null) {
            // Previously this threw a NullPointerException; a missing header is
            // now treated like an empty one.
            return nullValue;
        }
        String header = StringEscapeUtils.unescapeJava(str)
                .replace("\"", "")
                .replace("|", " ")
                .replace(",", "")
                .replace("{", "")
                .replace("}", "")
                .replace("\n", "")
                .replace("&nbsp;", " ")
                .replace("&nbsp", " ")
                .replace("nbsp", " ");
        // Locale.ROOT keeps lower-casing independent of the JVM default locale.
        header = markupPattern.matcher(header).replaceAll("")
                .toLowerCase(Locale.ROOT)
                .trim()
                .replace(".", "")
                .replace("$", "");
        return possibleNullValues.contains(header) ? nullValue : header;
    }

    /**
     * Normalises a cell value: removes line feeds, numeric character references
     * and markup, replaces non-breaking-space markers with blanks, lower-cases
     * and trims. Known null markers are replaced by {@link #nullValue}.
     *
     * @param str the raw value; may be {@code null}
     * @param z   if {@code true}, everything from the first '(' to the last ')'
     *            is removed after the null-marker check
     * @return the normalised value, or {@code null} if {@code str} is {@code null}
     */
    public static String normaliseValue(String str, boolean z) {
        if (str == null) {
            // Explicit guard instead of relying on a swallowed NullPointerException.
            return null;
        }
        String value = str.replace("\n", "")
                .replace("&nbsp;", " ")
                .replace("&nbsp", " ");
        value = numericEntityPattern.matcher(value).replaceAll("")
                .replace("nbsp", " ");
        // Locale.ROOT keeps lower-casing independent of the JVM default locale.
        value = markupPattern.matcher(value).replaceAll("")
                .toLowerCase(Locale.ROOT)
                .trim();
        if (possibleNullValues.contains(value)) {
            value = nullValue;
        }
        // nullValue is a mutable public field and could itself be null.
        if (z && value != null) {
            value = bracketsPattern.matcher(value).replaceAll("");
        }
        return value;
    }

    /**
     * Tokenises {@code str} with {@link #tokenise(String, boolean)} and joins
     * the tokens with single blanks.
     *
     * @param str the text to normalise
     * @param z   whether Porter stemming is applied
     * @return the tokens joined by {@code " "}
     */
    public static String normalise(String str, boolean z) {
        return StringUtils.join(tokenise(str, z), " ");
    }

    /**
     * Splits {@code str} into tokens. Java escape sequences are unescaped and
     * round brackets removed; the text is then split on whitespace, passed
     * through a word-delimiter filter (word parts, number parts, split on case
     * change, no number catenation), optionally Porter-stemmed, lower-cased and
     * filtered against the stop word set of Lucene's {@link StandardAnalyzer}.
     *
     * @param str the text to tokenise; {@code null} yields an empty list
     * @param z   if {@code true}, a {@link PorterStemFilter} is applied
     * @return the tokens in stream order; never {@code null}
     */
    public static List<String> tokenise(String str, boolean z) {
        List<String> tokens = new ArrayList<String>();
        if (str == null) {
            return tokens;
        }

        StopwordAnalyzerBase analyzer = new StandardAnalyzer(Version.LUCENE_46);
        try {
            TokenStream stream = null;
            try {
                Map<String, String> delimiterArgs = new HashMap<String, String>();
                delimiterArgs.put("generateWordParts", "1");
                delimiterArgs.put("generateNumberParts", "1");
                delimiterArgs.put("catenateNumbers", "0");
                delimiterArgs.put("splitOnCaseChange", "1");

                String cleaned = parenthesesPattern
                        .matcher(StringEscapeUtils.unescapeJava(str))
                        .replaceAll("");

                stream = new WordDelimiterFilterFactory(delimiterArgs)
                        .create(new WhitespaceTokenizer(Version.LUCENE_46, new StringReader(cleaned)));
                // NOTE(review): Lucene's PorterStemFilter documentation states that
                // its input should already be lower-cased, but stemming runs before
                // LowerCaseFilter here. The order is kept to preserve existing
                // token output -- confirm whether it is intended.
                if (z) {
                    stream = new PorterStemFilter(stream);
                }
                stream = new LowerCaseFilter(Version.LUCENE_46, stream);
                stream = new StopFilter(Version.LUCENE_46, stream, analyzer.getStopwordSet());

                CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
                // Reset the outermost stream so every filter in the chain is reset.
                stream.reset();
                while (stream.incrementToken()) {
                    tokens.add(term.toString());
                }
                stream.end();
            } catch (IOException ignored) {
                // Best effort, as before: the input is an in-memory StringReader,
                // so an I/O failure is not expected; return the tokens read so far.
            } finally {
                if (stream != null) {
                    try {
                        stream.close();
                    } catch (IOException ignored) {
                        // Nothing useful can be done if closing fails.
                    }
                }
            }
        } finally {
            analyzer.close();
        }
        return tokens;
    }
}
