import { ColumnDefinition, UploadDataContainer } from "../../../types/fileUploader";
import { PiiCategory } from "../../../types/quickAnalysis";
import { RuleCandidateWithoutMeta } from "../../../types/rules";
import { sampleDeterministically } from "../../../utils/sampling";
import {
  containsSensitive,
  getRegexFromPiiCategories,
} from "../qualityRuleFunctions/sensitiveTest";

// Size argument handed to `sampleDeterministically` when a column's values are
// sampled for PII detection in `predictSensitiveRules`.
const PII_SAMPLE_SIZE = 1000;
/**
 * PII categories that `getSensitiveRule` attaches to the generated rule as
 * `piiCategoriesToCheck`.
 */
export const ALL_PII_CATEGORIES: PiiCategory[] = [
  "Email address",
  "Social security number",
  "Passport number",
  "Credit card",
  "Phone number",
  "IP address",
  "IBAN",
  "Name",
];

/**
 * Proposes a "sensitive" rule candidate for every string column in which at
 * least one sampled value is flagged by `containsSensitive`.
 *
 * Columns whose `dataType` is not `"String"` are skipped. For each remaining
 * column, every cell value is stringified, a sample is drawn with
 * `sampleDeterministically` (size argument `PII_SAMPLE_SIZE`), and the sample
 * is checked against the patterns returned by `getRegexFromPiiCategories()`.
 *
 * @param container - Uploaded data; only `columns`, `data` and `fileName` are read.
 * @returns One rule candidate per flagged column, in column order.
 */
export const predictSensitiveRules = ({
  columns,
  data,
  fileName,
}: UploadDataContainer): RuleCandidateWithoutMeta[] => {
  // Built once and shared by all columns.
  const piiPatterns = getRegexFromPiiCategories();
  const candidates: RuleCandidateWithoutMeta[] = [];

  const stringColumns = columns.filter((column) => column.dataType === "String");
  for (const column of stringColumns) {
    const cellTexts = data.map((row) => row[column.index].value.toString());
    const sample = sampleDeterministically(cellTexts, PII_SAMPLE_SIZE);

    // Which category matched is not tracked for now: a single hit of any kind
    // yields a rule that checks all PII categories.
    const hasSensitiveValue = sample.some((text) => containsSensitive(text, piiPatterns));
    if (hasSensitiveValue) {
      candidates.push(getSensitiveRule(column, fileName));
    }
  }

  return candidates;
};

/**
 * Builds the "Sensitive" rule candidate for a single column.
 *
 * The rule id has the form `<fileName>::sensitive::<column.index>`, and the
 * attached quality test (`sensitiveTest`) is configured with
 * `ALL_PII_CATEGORIES`.
 *
 * @param column - Column the rule applies to.
 * @param fileName - Name of the file the column belongs to; used in the id and
 *   stored on the rule.
 * @returns A pre-accepted, warning-severity rule candidate for `column`.
 */
export const getSensitiveRule = (
  column: ColumnDefinition,
  fileName: string
): RuleCandidateWithoutMeta => {
  const ruleId = `${fileName}::sensitive::${column.index}`;

  const rule: RuleCandidateWithoutMeta = {
    id: ruleId,
    columns: [column],
    dimension: "sensitive",
    severity: "warning",
    confidence: 4,
    isAccepted: true,
    name: "Sensitive",
    description: "Tests that the column does not contain any sensitive (PII) data",
    qualityTest: {
      testFunctionName: "sensitiveTest",
      meta: { piiCategoriesToCheck: ALL_PII_CATEGORIES },
    },
    fileName,
  };
  return rule;
};
