import { ColumnDefinition, UploadDataContainer } from "../../../types/fileUploader";
import { RuleCandidateWithoutMeta } from "../../../types/rules";
import { getMostFrequentTuple } from "../../../utils/strings";
import { EMPTY_CELL_VALUES } from "../../quickAnalysis/qualityChecks/constants";
import { getPatternFromString, getRegexFromPattern } from "../qualityRuleFunctions/strPatterns";

// Default threshold for `minimumFrequency` in `getStrPatternRule`: the share of
// non-empty column values that must follow the most frequent pattern before a
// rule candidate is proposed (0.8 = 80%).
const DEFAULT_MINIMUM_FREQUENCY = 0.8;
/**
 * Collects string-pattern rule candidates for every column of an upload.
 *
 * Each column of `dataContainer` is passed to `getStrPatternRule` (using its
 * default minimum frequency); columns for which no rule is produced are
 * left out of the result.
 *
 * @param dataContainer The uploaded data whose columns are inspected.
 * @returns The rule candidates that were produced, in column order.
 */
export const predictStrPatterns = (
  dataContainer: UploadDataContainer
): RuleCandidateWithoutMeta[] =>
  dataContainer.columns
    .map((columnDefinition) => getStrPatternRule(columnDefinition, dataContainer))
    // Drop the columns that did not yield a candidate.
    .filter((candidate): candidate is RuleCandidateWithoutMeta => Boolean(candidate));

/**
 * Builds a "String Pattern" rule candidate for a single column.
 *
 * The column's cell values are converted to strings, values listed in
 * `EMPTY_CELL_VALUES` are discarded, and each remaining value is mapped to a
 * pattern via `getPatternFromString`. If the most frequent pattern covers at
 * least `minimumFrequency` of the non-empty values, a rule candidate carrying
 * the regex for that pattern is returned.
 *
 * @param column The column to inspect; only columns with `dataType === "String"` are considered.
 * @param dataContainer The uploaded data (rows and file name) the column belongs to.
 * @param minimumFrequency Minimum share (0..1) of non-empty values that must
 *   follow the most frequent pattern. Defaults to `DEFAULT_MINIMUM_FREQUENCY`.
 * @returns The rule candidate, or `undefined` when the column is not a string
 *   column, has no non-empty values, has no usable dominant pattern, or the
 *   dominant pattern is not frequent enough.
 */
export const getStrPatternRule = (
  column: ColumnDefinition,
  dataContainer: UploadDataContainer,
  minimumFrequency: number = DEFAULT_MINIMUM_FREQUENCY
): RuleCandidateWithoutMeta | undefined => {
  const { dataType, index } = column;
  if (dataType !== "String") return undefined;

  const valuesAsStrings = dataContainer.data
    .map((row) => row[index].value.toString())
    .filter((value) => !EMPTY_CELL_VALUES.includes(value));

  // With no non-empty values the ratio below would divide by zero (0 / 0 is
  // NaN, and `NaN < x` is always false), letting an empty column slip through
  // the frequency check. There is nothing to learn a pattern from, so bail out.
  if (valuesAsStrings.length === 0) return undefined;

  // `frequency` is treated as an occurrence count here, since it is divided by
  // the number of values to obtain a share.
  const [mostFrequent, frequency] = getMostFrequentTuple(
    valuesAsStrings.map((s) => getPatternFromString(s))
  );

  // No rule without a defined dominant pattern.
  if (mostFrequent === undefined) return undefined;
  if (frequency / valuesAsStrings.length < minimumFrequency) return undefined;

  const patternGenerated = getRegexFromPattern(mostFrequent);
  return {
    id: `${dataContainer.fileName}::outlier::strPatterns::${index}`,
    columns: [column],
    dimension: "inconsistent",
    severity: "warning",
    confidence: 4,
    isAccepted: true,
    name: "String Pattern",
    description: `Test if entries follow the most frequent string pattern in this column.`,
    qualityTest: {
      testFunctionName: "strPatterns",
      meta: {
        // Drops the first and last character of the string form — presumably
        // the enclosing slashes of a flag-less RegExp; TODO confirm against
        // `getRegexFromPattern`.
        patternRegex: patternGenerated.toString().slice(1, -1),
      },
    },
    fileName: dataContainer.fileName,
  };
};
