Évaluation d’un modèle

Connexion à l’API et ajout des données

viewof username = Inputs.password({label: html`<b>Set username</b>`, value: ""})

viewof password = Inputs.password({label: html`<b>Set password</b>`, value: ""})

viewof csvfile = Inputs.file({
  label: md`__Observations to evaluate__ 📁 ([Example here](https://raw.githubusercontent.com/ThomasFaria/codif-ape-API/main/website/exemples/exemple.csv))`,
  accept: ".csv",
  required: true
})

Distribution des indices de confiance

// Dashboard layout built as a CSS grid: the first row holds the `quantile` slider, the
// Accuracy value and the Seuil value; the second and third rows hold the `distrib` and
// `distrib_adjusted` charts.
// NOTE(review): `width: ${screen.width};` interpolates a bare number with no CSS unit;
// browsers normally discard such a declaration — confirm whether `px` was intended.
grid = await html`<div style="
              background: #fff;
              margin: 0;
              border: none ;
              display: grid;
              width: ${screen.width};
              grid-template-areas: 
                'a b c'
                'd d d'
                'e e e'
                ;
              grid-gap: 10px;
            ">
              <div name="a" style="grid-area: a; position: relative;">${viewof quantile}</div>
              <div name="b" style="grid-area: b; position: relative;"><div class="textstyled">Accuracy: ${Accuracy}</div></div>
              <div name="c" style="grid-area: c; position: relative;"><div class="textstyled">Seuil: ${Seuil}</div></div>
              <div name="d" style="grid-area: d; position: relative;">${distrib}</div>
              <div name="e" style="grid-area: e; position: relative;">${distrib_adjusted}</div>
            </div>`

Analyse des erreurs

// Bar chart of the 15 first entries of `count_errors` (value on x, frequency on y),
// with bars ordered by descending height.
Plot.plot({
  marks: [
    Plot.barY(count_errors.slice(0, 15), {
      x: "value",
      y: "frequency",
      sort: { x: "-y" },
      tip: true
    })
  ]
})

viewof search = Inputs.search(subset.filter((d) => 1 - d.Result))

// Table showing the rows currently selected by the `search` box.
WrongPredictions = Inputs.table(search)

Impact du seuil de reprise sur la performance et le taux de classification automatique

// Single-column CSS grid stacking the three threshold charts:
// `codif_seuil`, `acc_seuil` and `acc_codif`.
// NOTE(review): `width: ${screen.width};` interpolates a bare number with no CSS unit;
// browsers normally discard such a declaration — confirm whether `px` was intended.
grid2 = await html`<div style="
              background: #fff;
              margin: 0;
              border: none ;
              display: grid;
              width: ${screen.width};
              grid-template-areas: 
                'a'
                'b'
                'c'
                ;
              grid-gap: 10px;
            ">
              <div name="a" style="grid-area: a; position: relative;">${codif_seuil}</div>
              <div name="b" style="grid-area: b; position: relative;">${acc_seuil}</div>
              <div name="c" style="grid-area: c; position: relative;">${acc_codif}</div>
            </div>`

viewof quantile = Inputs.range([0, 100], {label: "Taux de classification automatique", step: 1, value: 80})

distrib = Plot.plot({
  round: true,
  width: screen.width * 0.7,
  color: { scheme: "BuRd", legend: true },
  marks: [
    Plot.rectY(
      processed_response,
      Plot.binX(
        { y: "count" },
        { x: "IC", fill: "Result", mixBlendMode: "multiply", tip: true }
      )
    ),
    Plot.ruleY([0]),
    Plot.ruleX([Seuil], { strokeWidth: 2, tip: true })
  ]
})

distrib_adjusted = Plot.plot({
  width: screen.width * 0.7,
  y: { percent: true },
  color: { scheme: "BuRd", legend: true },
  marks: [
    Plot.rectY(
      processed_response,
      Plot.normalizeY(
        "sum", // normalize each series by the sum per series
        Plot.binX(
          { y2: "count" }, // disable implicit stack transform
          { x: "IC", fill: "Result", mixBlendMode: "multiply", tip: true }
        )
      )
    ),
    Plot.ruleY([0]),
    Plot.ruleX([Seuil], { strokeWidth: 2, tip: true })
  ]
})

// Confidence threshold: the (1 - quantile/100) quantile of the IC values,
// rounded to 2 decimals, where `quantile` is the slider value in percent.
Seuil = round(
  d3.quantile(processed_response, 1 - quantile / 100, (row) => row.IC),
  2
)

// Mean of Result over `subset`, expressed in percent and rounded to 2 decimals.
Accuracy = round(
  d3.mean(subset, (row) => row.Result) * 100,
  2
)

codif_seuil = Plot.plot({
  grid: true,
  y: {
    label: "↑ Taux de classification automatique (%)",
    percent: true
  },
  x: {
    label: "→ Seuil choisi"
  },
  marks: [
    Plot.line(ic_tx_classif, {
      x: "seuils",
      y: "codif_auto",
      stroke: "black",
      strokeWidth: 2
    }),
    Plot.tip(
      ic_tx_classif,
      Plot.pointer({
        x: "seuils",
        y: "codif_auto",
        title: (d) => `Accuracy : ${round(d.accuracies, 2)}\nSeuil : ${round(
          d.seuils,
          2
        )}\nTaux classification automatique: 
${round(d.codif_auto * 100, 2)} %`
      })
    ),
    Plot.tip([`Seuil à 0.6`], {
      x: ic_tx_classif[60].seuils,
      y: ic_tx_classif[60].codif_auto,
      anchor: "bottom"
    })
  ],
  color: { legend: true }
})

acc_seuil = Plot.plot({
  grid: true,
  y: {
    label: "↑ Accuracy (%)",
    percent: true
  },
  x: {
    label: "→ Seuil choisi"
  },
  marks: [
    Plot.line(ic_tx_classif, {
      x: "seuils",
      y: "accuracies",
      stroke: "black",
      strokeWidth: 2
    }),
    Plot.tip(
      ic_tx_classif,
      Plot.pointer({
        x: "seuils",
        y: "accuracies",
        title: (d) => `Accuracy : ${round(d.accuracies, 2)}\nSeuil : ${round(
          d.seuils,
          2
        )}\nTaux classification automatique: 
${round(d.codif_auto * 100, 2)} %`
      })
    ),
    Plot.tip([`Seuil à 0.6`], {
      x: ic_tx_classif[60].seuils,
      y: ic_tx_classif[60].accuracies,
      anchor: "bottom"
    })
  ],
  color: { legend: true }
})

acc_codif = Plot.plot({
  grid: true,
  y: {
    label: "↑ Accuracy (%)",
    percent: true
  },
  x: {
    label: "→ Taux de classification automatique (%)",
    percent: true
  },
  marks: [
    Plot.line(ic_tx_classif, {
      x: "codif_auto",
      y: "accuracies",
      stroke: "black",
      strokeWidth: 2
    }),
    Plot.tip(
      ic_tx_classif,
      Plot.pointer({
        x: "codif_auto",
        y: "accuracies",
        title: (d) => `Accuracy : ${round(d.accuracies, 2)}\nSeuil : ${round(
          d.seuils,
          2
        )}\nTaux classification automatique: 
${round(d.codif_auto * 100, 2)} %`
      })
    ),
    Plot.tip([`Seuil à 0.6`], {
      x: ic_tx_classif[60].codif_auto,
      y: ic_tx_classif[60].accuracies,
      anchor: "bottom"
    })
  ],
  color: { legend: true }
})

ic_tx_classif = {
  const newArray = [];
  for (let i = 0; i <= 1; i += 0.01) {
    let seuil = round(i, 2);
    newArray.push({
      seuils: seuil,
      codif_auto:
        processed_response.filter((d) => d.IC > seuil).length /
        processed_response.length,
      accuracies: get_accuracies_from_threshold(processed_response, i, {
        round_int: 15
      })
    });
  }
  return newArray;
}

// Rows of the `liasses` table (the uploaded CSV registered in `db`), restricted to the
// columns that are sent to the evaluation API by the `response` cell.
data = db.query(
  `
SELECT text_description, type_, nature, surface, event, code
FROM liasses
`
)

// Column-oriented view of `data`: { column: [value in row 0, value in row 1, …] }.
transformedData = data.reduce((columns, row) => {
  for (const [key, value] of Object.entries(row)) {
    // Look the method up on Object.prototype: a column literally named
    // "hasOwnProperty" would otherwise shadow it on the accumulator and make the
    // next call fail.
    if (!Object.prototype.hasOwnProperty.call(columns, key)) {
      columns[key] = [];
    }
    columns[key].push(value);
  }
  return columns;
}, {})

db = DuckDBClient.of({
  liasses: await csvfile.csv()
})

// Turns the column-oriented API response ({ IC: {key: …}, Probability: {key: …}, … })
// into one record per key.
processed_response = Object.keys(response.IC).map((key) => {
  const ic = response.IC[key];
  return {
    // An IC equal to 1 is nudged just below 1; every strict `IC > threshold` test in
    // this notebook then fails for it at threshold 1.
    IC: ic == 1 ? ic - 1e-16 : ic,
    Probability: response.Probability[key],
    Prediction: response.Prediction[key],
    Code: response.Code[key],
    Result: response.Result[key],
    Lib: response.Lib[key]
  };
})

// Reply of the evaluation API for the uploaded observations, requested with the
// credentials typed in the `username` and `password` inputs.
response = queryBatchApi(username, password, data)

// Predictions whose confidence index is strictly above the current threshold `Seuil`.
subset = processed_response.filter((row) => row.IC > Seuil)

// Array with the 101 quantile values 0, 0.01, …, 1.
quantiles = [...Array(101).keys()].map((step) => step / 100)

seuils = quantiles.map((quantile) =>
  d3.quantile(
    processed_response.map((d) => d.IC),
    1 - quantile
  )
)

// Occurrence count of each `Code` among the rows of `subset` whose Result is not 1
// (rows for which `1 - Result` is non-zero), as returned by valueCounts.
count_errors = valueCounts(
  subset
    .filter((row) => 1 - row.Result)
    .map((row) => row.Code)
)

/**
 * Send the observations to the evaluation endpoint and return its parsed JSON reply.
 *
 * The rows are pivoted into a column-oriented body ({ column: [values…] }) and posted
 * as JSON with HTTP Basic authentication.
 *
 * @param {string} username - User name for the Basic authentication header.
 * @param {string} password - Password for the Basic authentication header.
 * @param {Array<Object>} data - Row objects to evaluate.
 * @returns {Promise<Object>} Parsed JSON body of a successful response.
 * @throws {Error} When the API answers with a non-OK status. Failing here gives a
 *   readable message instead of letting dependent cells crash on an unexpected value.
 */
async function queryBatchApi(username, password, data) {
  const url = "https://codification-ape.lab.sspcloud.fr/evaluation";

  // Create the request body: one array of values per column.
  const request_body = {};
  for (const row of data) {
    for (const [key, value] of Object.entries(row)) {
      // Prototype lookup so that a column named "hasOwnProperty" cannot shadow the check.
      if (!Object.prototype.hasOwnProperty.call(request_body, key)) {
        request_body[key] = [];
      }
      request_body[key].push(value);
    }
  }

  const response = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Basic ${btoa(`${username}:${password}`)}`
    },
    body: JSON.stringify(request_body)
  });

  if (response.ok) {
    return response.json();
  }

  if (response.status === 400) {
    let detail;
    try {
      detail = (await response.json()).detail;
    } catch (err) {
      // Body is not JSON: fall through to the generic message below.
      detail = undefined;
    }
    if (detail !== undefined) {
      const text = typeof detail === "string" ? detail : JSON.stringify(detail);
      throw new Error(`API request rejected (HTTP 400): ${text}`);
    }
  }

  throw new Error(
    `Error occurred while querying the API (HTTP ${response.status}).`
  );
}

/**
 * Mean of `Result` over the rows whose `IC` is strictly above `threshold`.
 *
 * @param {Array<Object>} data - Rows exposing `IC` and `Result`.
 * @param {number} threshold - Rows with `IC <= threshold` are ignored.
 * @param {Object} [options]
 * @param {number} [options.round_int=2] - Number of decimals kept in the result.
 * @returns {number} The rounded mean, or 1 when the mean is undefined (no usable row).
 */
function get_accuracies_from_threshold(
  data,
  threshold,
  { round_int = 2 } = {}
) {
  const keptResults = data
    .filter((row) => row.IC > threshold)
    .map((row) => row.Result);
  const meanResult = d3.mean(keptResults);

  if (meanResult === undefined) {
    return 1;
  }
  return round(meanResult, round_int);
}

/**
 * Round `d` to `int` decimal places (scale by 10^int, Math.round, scale back).
 *
 * @param {number} d - Value to round.
 * @param {number} int - Number of decimals to keep.
 * @returns {number} The rounded value.
 */
function round(d, int) {
  const scale = Math.pow(10, int);
  return Math.round(d * scale) / scale;
}

/**
 * Count the occurrences of each distinct value of `array`.
 *
 * Values are used as object keys, so they are compared and reported as strings.
 *
 * @param {Array} array - Values to count.
 * @param {Object} [options]
 * @param {boolean} [options.freq=false] - When true, report relative frequencies
 *   (count / array.length) instead of raw counts.
 * @returns {Array<{value: string, frequency: number}>} One entry per distinct value,
 *   sorted by descending frequency.
 */
function valueCounts(array, { freq = false } = {}) {
  // Null-prototype dictionary: values such as "constructor" or "toString" must not
  // collide with properties inherited from Object.prototype.
  const counts = Object.create(null);
  for (const value of array) {
    counts[value] = (counts[value] || 0) + 1;
  }

  const total = array.length;
  return Object.entries(counts)
    .map(([value, count]) => ({
      value,
      frequency: freq ? count / total : count
    }))
    .sort((a, b) => b.frequency - a.frequency); // Descending frequency
}

Plot = require("https://cdn.jsdelivr.net/npm/@observablehq/plot@0.6.8/dist/plot.umd.min.js")