import { mean, standardDeviation } from "simple-statistics";
import { ColumnDefinition, UploadDataContainer } from "../../../types/fileUploader";
import { Category, RuleCandidateWithoutMeta } from "../../../types/rules";
import { getFrequencyMap } from "../../../utils/strings";

/**
 * Produces one categorical rule candidate for every column of the upload
 * that is flagged as categorical data.
 *
 * Columns whose `isCategoricalData` flag is falsy are skipped. The returned
 * candidates follow the order of `dataContainer.columns`.
 *
 * @param dataContainer - Uploaded file contents together with its column definitions.
 * @returns The rule candidates for all categorical columns (possibly empty).
 */
export const predictCategoricalRules = (
  dataContainer: UploadDataContainer
): RuleCandidateWithoutMeta[] =>
  dataContainer.columns
    .filter((candidateColumn) => Boolean(candidateColumn.isCategoricalData))
    // Explicit lambda so that map's extra (index, array) arguments are not forwarded.
    .map((categoricalColumn) => getCategoricalRule(categoricalColumn, dataContainer));

/**
 * How many standard deviations below the mean frequency a category's
 * frequency may fall before the category is flagged as not legal.
 */
const STD_DEVS_CUTOFF_FOR_CATEGORICAL = 3;

/**
 * Builds the "Categorical data" rule candidate for a single column.
 *
 * Every distinct (stringified) value of the column becomes a category with
 * its observed frequency. A category is considered legal unless its
 * frequency is an outlier on the low side, i.e. not strictly greater than
 * `mean - STD_DEVS_CUTOFF_FOR_CATEGORICAL * standardDeviation` of all
 * category frequencies.
 *
 * @param column - Definition of the column to analyse; `column.index` selects the cell in each row.
 * @param dataContainer - Uploaded file contents (`data` rows and `fileName`).
 * @returns A rule candidate whose `qualityTest.meta.legalCategoricalValues`
 *   lists every observed category with its frequency and legality.
 */
export const getCategoricalRule = (
  column: ColumnDefinition,
  dataContainer: UploadDataContainer
): RuleCandidateWithoutMeta => {
  const values = dataContainer.data.map((row) => row[column.index].value.toString());
  const frequencyMap = getFrequencyMap(values);
  const frequencies: number[] = Object.values(frequencyMap);

  // With no rows there are no frequencies to summarise (the mean of an empty
  // list is undefined), so skip the statistics; the category list is empty anyway.
  const hasFrequencies = frequencies.length > 0;
  const meanFrequency = hasFrequencies ? mean(frequencies) : 0;
  const frequencySpread = hasFrequencies ? standardDeviation(frequencies) : 0;
  const minLegalFrequency = meanFrequency - STD_DEVS_CUTOFF_FOR_CATEGORICAL * frequencySpread;

  // When all categories are equally frequent (including a single category) the
  // spread is 0 and the cutoff collapses onto the mean; the strict comparison
  // would then reject every category. Zero spread means there are no outliers.
  const hasNoSpread = frequencySpread === 0;

  const categories: Category[] = Object.entries(frequencyMap).map(([value, frequency]) => {
    const isLegal = hasNoSpread || frequency > minLegalFrequency;
    return { value, isLegal, frequency };
  });

  return {
    id: `${dataContainer.fileName}::categorical_check::${column.index}`,
    columns: [column],
    dimension: "inconsistent",
    severity: "info",
    confidence: 3,
    isAccepted: true,
    name: "Categorical data",
    description: "Ensures values are legal categorical values",
    qualityTest: {
      testFunctionName: "categoricalData",
      meta: {
        legalCategoricalValues: categories,
      },
    },
    fileName: dataContainer.fileName,
  };
};
