"""
.. currentmodule:: neuralk

.. _example_housing_classification:

Using the Neuralk Seldon Model
==============================

The :class:`Seldon` model is the simplest way to use Neuralk's In-Context
Learning model for classification and regression. It offers the usual scikit-learn
interface so it can easily be inserted into any machine-learning pipeline.

Seldon automatically detects whether the task is classification or regression
based on the target variable provided during ``fit()``.

.. note::

    For this example to run, the environment variable ``API_KEY`` must be set
    with your Neuralk API key.
"""

# %%
# Simple example on toy data
# --------------------------
#
# We start by using Seldon on simple data that needs no preprocessing.
#
# Generate simple data:

# %%
import os
import warnings

import skrub
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.pipeline import make_pipeline

from neuralk import Seldon, SeldonClassifier, datasets

# Ask skrub not to swap the default dataframe display for its table report.
# NOTE(review): the TypeError guard presumably covers skrub releases whose
# ``set_config`` does not accept the ``use_table_report`` keyword -- confirm
# which versions need it. Skipping the call is a deliberate best-effort: the
# rest of the example does not depend on this setting.
try:
    skrub.set_config(use_table_report=False)
except TypeError:
    pass

# API key for Neuralk cloud service. A missing ``API_KEY`` environment
# variable raises ``KeyError`` here, before any data is generated.
API_KEY = os.environ["API_KEY"]

# Seed both the data generation and the split so that every run of this
# example works on exactly the same train and test sets.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
print(f"{X_train.shape=} {y_train.shape=} {X_test.shape=} {y_test.shape=}")

# %%
# Now we apply Neuralk's model.

# %%

# Note: fit() does no training on our data -- in-context learning models come
# pretrained and need no dataset-specific fitting.
seldon = Seldon(api_key=API_KEY)
model = seldon.fit(X_train, y_train)

# Score the predictions for the held-out rows against the true labels.
predictions = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy}")

# %%
# Working with non-numeric data
# -----------------------------
#
# The Neuralk Seldon model is a raw estimator that does not perform any
# preprocessing. To handle complex datasets, we need to encode non-numeric data
# and possibly reduce the feature dimension. The example below shows a simple
# pipeline that yields good results for most datasets.
#
# The example dataset contains the descriptions and sale price of houses. The
# prediction target is the sale price (binned to transform it into a
# classification task).

# %%

# Load the housing data: ``X`` holds the house descriptions and ``y`` the
# prediction target.
X, y = datasets.housing()

# Preview the first four feature columns next to the target. The target is
# attached *after* slicing: ``assign`` appends its column last, so slicing
# afterwards would cut it out of the preview.
X.iloc[:, :4].assign(Sale_Price=y).head()

# %%
# As we can see above, the dataset contains many columns of different types.
# The basic Neuralk service only accepts numeric data.
# Moreover, it is better to send data that is not too high-dimensional;
# otherwise the model is forced to subsample the context, which can
# degrade performance.
#
# To meet those requirements, we build a simple pipeline that transforms the
# input to a numeric array with the :class:`skrub.TableVectorizer`, then scales
# features, imputes missing values and reduces dimension with a Principal
# Component Analysis.
# Note that the :class:`neuralk.Seldon` universal predictor is not compatible
# with the scikit-learn estimator interface so we replace it with a
# :class:`neuralk.SeldonClassifier`.
#
# .. note::
#
#    Here we perform dimensionality reduction to control the number of columns.
#    If the dataset also has a very large number of rows, the model will
#    subsample the training data to be able to fit it in memory and make a
#    prediction. This is done by default and you do not need to do anything to
#    activate it. However, if you have a way to select a better training
#    subsample than the default random one, you may want to do the subsampling
#    yourself.
#
# We start by defining the pipeline:

# %%

# Steps that run in front of the in-context learner, in order: vectorize the
# table, scale, impute missing values, then keep 40 principal components.
preprocessing_steps = [
    skrub.TableVectorizer(),
    skrub.SquashingScaler(),
    SimpleImputer(),
    PCA(n_components=40),
]

model = make_pipeline(*preprocessing_steps, SeldonClassifier(api_key=API_KEY))

# %%
# And now we can evaluate its performance.

# %%

# scikit-learn emits a spurious warning while preprocessing some categorical
# columns; silence it.
warnings.filterwarnings(
    action="ignore",
    message="Found unknown categories.*during transform",
)

# Cross-validate the pipeline; any failure in a fold is raised, not scored.
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring="accuracy",
    error_score="raise",
)
# Leave the per-fold accuracies as the cell's last expression.
cv_results["test_score"]

# %%
# For comparison, we can run the same experiment after replacing the in-context
# learner with gradient boosting:

# %%

# No PCA(40) step in this pipeline: the PCA makes results worse for the
# gradient boosting.
boosting_steps = [
    skrub.TableVectorizer(),
    skrub.SquashingScaler(),
    SimpleImputer(),
    HistGradientBoostingClassifier(),
]
model = make_pipeline(*boosting_steps)

# Cross-validate the baseline; any failure in a fold is raised, not scored.
cv_results = cross_validate(
    estimator=model,
    X=X,
    y=y,
    scoring="accuracy",
    error_score="raise",
)
# Leave the per-fold accuracies as the cell's last expression.
cv_results["test_score"]