Example of the aitlas toolbox for benchmarking a dataset#

This notebook shows a sample implementation of multi-class image classification with the aitlas toolbox on the UC Merced dataset, benchmarked over a range of train/test split ratios.

[18]:
import pandas as pd
import matplotlib.pyplot as plt

from aitlas.datasets import UcMercedDataset
from aitlas.models import VGG16
from aitlas.tasks import StratifiedSplitTask

Define the splits and dataset paths#

[19]:
# Train/test percentages to benchmark, in 10-point steps from 10/90 to 90/10.
# Each tuple is (train, test) and the two values always sum to 100.
splits = [(train_pct, 100 - train_pct) for train_pct in range(10, 100, 10)]

# Directory on disk that holds the UC Merced dataset.
data_dir = "/media/hdd/multi-class/UCMerced"

Loop through the splits, train and evaluate#

[ ]:
# Benchmark loop: for every (train, test) ratio in `splits`, write a stratified
# split to CSV, build the train/test datasets from those CSVs, train a fresh
# pretrained VGG16 and store the resulting F1 metrics.
results = []  # one metrics mapping per split, in the same order as `splits`

# Settings shared by every split (loop-invariant, so defined once).
epochs = 50
model_directory = "./experiments/uc_merced/"

# iterate through the splits
for train, test in splits:
    # The split files live next to the dataset. Both paths are derived from
    # `data_dir` so that pointing the notebook at another dataset location
    # does not leave the CSV files behind in the old directory.
    # where to store the file names of the train images
    train_csv_file = f"{data_dir}/train_{train}_{test}.csv"
    # where to store the file names of the test images
    test_csv_file = f"{data_dir}/test_{train}_{test}.csv"

    # configure split task: each part gets its ratio and the CSV it is written to
    split_config = {
        "split": {
            "train": {
                "ratio": train,
                "file": train_csv_file
            },
            "test": {
                "ratio": test,
                "file": test_csv_file
            }
        },
        "data_dir": data_dir
    }
    split_task = StratifiedSplitTask(None, split_config)
    split_task.run()

    # setup train set (shuffled; uses the flip variant of the transform,
    # presumably as augmentation -- confirm against aitlas.transforms)
    train_dataset_config = {
        "batch_size": 4,
        "shuffle": True,
        "num_workers": 4,
        "csv_file": train_csv_file,
        "data_dir": data_dir,
        "transforms": ["aitlas.transforms.ResizeCenterCropFlipHVToTensor"]
    }
    train_dataset = UcMercedDataset(train_dataset_config)

    # setup test set (fixed order, no flip transform)
    test_dataset_config = {
        "batch_size": 4,
        "shuffle": False,
        "num_workers": 4,
        "csv_file": test_csv_file,
        "data_dir": data_dir,
        "transforms": ["aitlas.transforms.ResizeCenterCropToTensor"]
    }
    test_dataset = UcMercedDataset(test_dataset_config)
    print(f"Train size: {len(train_dataset)}, Test size: {len(test_dataset)}")

    # setup model: a new VGG16 is created for every split so that no weights
    # carry over from the previous ratio
    model_config = {
        "num_classes": 21,
        "learning_rate": 0.0001,
        "pretrained": True
    }
    model = VGG16(model_config)
    model.prepare()

    # training and evaluation; the test split is passed as the validation set.
    # NOTE(review): every split previously shared run_id='1'. Presumably aitlas
    # uses run_id to label a run's outputs under model_directory, so a
    # per-split id keeps the nine runs distinguishable -- confirm against the
    # aitlas documentation.
    model.train_and_evaluate_model(
        train_dataset=train_dataset,
        epochs=epochs,
        model_directory=model_directory,
        val_dataset=test_dataset,
        run_id=f"{train}_{test}",
    )

    # collect results; the returned mapping is later read via "F1_score Micro"
    results.append(model.running_metrics.f1_score())

See the results#

[21]:
# Pull the micro-averaged F1 score out of each split's metrics and round it
# to four decimals for display.
micro_f1_scores = []
for metrics in results:
    micro_f1_scores.append(round(float(metrics["F1_score Micro"]), 4))

# One row per split: the (train, test) tuple next to its score. zip() pairs the
# two sequences positionally and stops at the shorter one.
df = pd.DataFrame(
    list(zip(splits, micro_f1_scores)),
    columns=["Train/Test", "Micro F1 score"],
)
df
[21]:
Train/Test Micro F1 score
0 (10, 90) 0.7450
1 (20, 80) 0.7107
2 (30, 70) 0.8741
3 (40, 60) 0.8619
4 (50, 50) 0.9181
5 (60, 40) 0.8857
6 (70, 30) 0.9032
7 (80, 20) 0.8405
8 (90, 10) 0.9143
[22]:
# Line chart of the micro F1 score against the train/test split.
df.plot.line(x="Train/Test", y="Micro F1 score")
plt.show()
../_images/examples_land_use_classification_benchmark_8_0.png
[ ]: