import { sampleCorrelation } from "simple-statistics";
import { ColumnDefinition, UploadDataContainer } from "../../../types/fileUploader";
import { RuleCandidateWithoutMeta } from "../../../types/rules";
import { sampleDeterministically } from "../../../utils/sampling";
import { getAllPossiblePairs } from "../../../utils/combinatorics";
import { getNumericCellsForTwoColumns } from "../getNumericCells";

// Upper bound on the number of rows sampled from the upload before correlations are computed.
const MAX_SAMPLE_SIZE = 100;
// Threshold that getColumnCorrelations compares each pair's sample correlation against
// when deciding whether the pair is kept as a rule candidate.
const CORRELATION_CUTOFF_POINT = 0.7;
// Correlations strictly above this value are treated as "strong" when building the rule
// (severity "warning" and confidence 5 instead of "info" and 4).
const STRONG_CORRELATION_CUTOFF_POINT = 0.8;

/** A pair of columns together with the correlation value computed for them. */
type ColumnCorrelation = {
  /** First column of the pair. */
  column1: ColumnDefinition;
  /** Second column of the pair. */
  column2: ColumnDefinition;
  /** Correlation of the two columns' values; in this file it comes from `sampleCorrelation`. */
  correlation: number;
};

/**
 * Proposes correlation rule candidates for pairs of numeric columns.
 *
 * A deterministic sample of at most `MAX_SAMPLE_SIZE` rows is taken from the
 * upload, the columns typed "Double" or "Integer" are selected, and every
 * column pair reported by `getColumnCorrelations` is converted into a rule
 * candidate.
 *
 * @param dataContainer Uploaded data with its column definitions.
 * @returns One rule candidate per reported column pair.
 */
export const predictColumnCorrelations = (
  dataContainer: UploadDataContainer
): RuleCandidateWithoutMeta[] => {
  const { data, columns } = dataContainer;

  // Never request more rows than the upload actually contains.
  const sampleSize = Math.min(data.length, MAX_SAMPLE_SIZE);
  const sampledRows = sampleDeterministically(data, sampleSize);

  const numericColumns = columns.filter(
    ({ dataType }) => dataType === "Double" || dataType === "Integer"
  );

  // Correlations are computed on the sample only; the rules themselves are
  // built against the original container.
  const sampledContainer = { ...dataContainer, data: sampledRows };
  const rules: RuleCandidateWithoutMeta[] = [];
  for (const pairCorrelation of getColumnCorrelations(sampledContainer, numericColumns)) {
    rules.push(createColumnCorrelationRule(pairCorrelation, dataContainer));
  }
  return rules;
};

/**
 * Builds the "outlier" rule candidate describing a correlated column pair.
 *
 * A correlation strictly above `STRONG_CORRELATION_CUTOFF_POINT` yields
 * severity "warning" with confidence 5; anything else yields "info" with
 * confidence 4.
 *
 * @param columnCorrelation The column pair and its correlation value.
 * @param dataContainer Upload the rule belongs to; only `fileName` is read.
 * @returns The rule candidate, accepted by default.
 */
export const createColumnCorrelationRule = (
  columnCorrelation: ColumnCorrelation,
  dataContainer: UploadDataContainer
): RuleCandidateWithoutMeta => {
  const { column1, column2, correlation } = columnCorrelation;
  const { fileName } = dataContainer;

  const strong = correlation > STRONG_CORRELATION_CUTOFF_POINT;
  const severity = strong ? "warning" : "info";
  const confidence = strong ? 5 : 4;

  return {
    // The id combines the file name with both column indexes.
    id: `${fileName}::outlier::column_correlation::${column1.index}::${column2.index}`,
    columns: [column1, column2],
    dimension: "outlier",
    severity,
    confidence,
    isAccepted: true,
    name: "Correlation",
    description: `Test if entries in column ${column1.name} and ${column2.name} are correlated.`,
    qualityTest: {
      testFunctionName: "columnCorrelation",
      meta: { correlation },
    },
    fileName,
  };
};

// Minimum fraction of rows for which getNumericCellsForTwoColumns must return a cell pair;
// column pairs below this ratio are skipped when computing correlations.
const MIN_NON_EMPTY_OR_INCONSISTENT_RATIO = 0.8;

/**
 * Computes the sample correlation for every pair of the given columns and
 * returns the pairs whose correlation reaches `CORRELATION_CUTOFF_POINT`.
 *
 * A pair is skipped when:
 * - fewer than two numeric cell pairs are available (`sampleCorrelation`
 *   cannot be computed and would throw),
 * - the share of rows yielding a numeric cell pair is below
 *   `MIN_NON_EMPTY_OR_INCONSISTENT_RATIO`,
 * - the correlation is not a finite number (e.g. a column with zero variance), or
 * - the correlation is below `CORRELATION_CUTOFF_POINT` (this includes zero
 *   and all negative correlations).
 *
 * @param dataContainer Container whose `data` rows are analysed.
 * @param numericColumnIndexes Definitions of the numeric columns to pair up.
 * @returns The sufficiently correlated column pairs with their correlation.
 */
const getColumnCorrelations = (
  dataContainer: UploadDataContainer,
  numericColumnIndexes: ColumnDefinition[]
): ColumnCorrelation[] => {
  const columnCorrelations: ColumnCorrelation[] = [];
  const rowCount = dataContainer.data.length;

  getAllPossiblePairs(numericColumnIndexes).forEach(({ item1: column1, item2: column2 }) => {
    const numericCells = getNumericCellsForTwoColumns(
      dataContainer.data,
      column1.index,
      column2.index
    );
    // A correlation needs at least two data points. This also covers empty
    // data, where the ratio below would be 0 / 0 = NaN and slip past the check.
    if (numericCells.length < 2) return;

    const notEmptyRatio = numericCells.length / rowCount;
    if (notEmptyRatio < MIN_NON_EMPTY_OR_INCONSISTENT_RATIO) return;

    const values1 = numericCells.map(({ cell1 }) => cell1.value);
    const values2 = numericCells.map(({ cell2 }) => cell2.value);
    const correlation = sampleCorrelation(values1, values2);

    // Reject NaN/Infinity explicitly: a truthiness check would let 0 and NaN
    // bypass the cutoff comparison and be reported as correlated.
    if (!Number.isFinite(correlation) || correlation < CORRELATION_CUTOFF_POINT) return;

    columnCorrelations.push({ column1, column2, correlation });
  });

  return columnCorrelations;
};
