#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Benjamin Vial
# This file is part of nannos
# License: GPLv3
# See the documentation at nannos.gitlab.io
"""
Backends comparison
===================

Numerical backends performance comparison.
"""

import matplotlib.pyplot as plt
import numpy as np

markers = ["o", "s", "d", "v", "^", ">"]
figsize = (2, 2)
threads = [1, 2, 4, 8, 16]
devices = ["cpu", "gpu"]
# backends = ["numpy", "scipy", "autograd", "jax", "torch"]
# jax is skipped: controlling its multithreading is difficult, so a fair
# comparison is not possible
backends = ["numpy", "scipy", "autograd", "torch"]

colors = ["#3b9dd4", "#ecd142", "#e87c40", "#b33dd1", "#50ba61", "#cd2323"]


##############################################################################
# Time vs. number of harmonics
# ------------------------------

for num_threads in threads:
    plt.figure(figsize=figsize)
    i = 0
    for backend in backends:
        for device in devices:
            g = "cuda" if device == "gpu" else device
            if device != "gpu" or backend not in [
                "numpy",
                "scipy",
                "autograd",
                "jax",
            ]:
                arch = np.load(
                    f"{num_threads}/benchmark_{backend}_{g}.npz", allow_pickle=True
                )
                NH = arch["real_nh"]
                plt.plot(
                    arch["real_nh"],
                    arch["times"],
                    f"-{markers[i]}",
                    c=colors[i],
                    label=f"{backend} {device}",
                )
                times_all = np.array(arch["times_all"])
                times_std = np.std(times_all, axis=1)
                plt.errorbar(
                    arch["real_nh"],
                    arch["times"],
                    times_std,
                    c=colors[i],
                    capsize=1,
                )
                i += 1
    plt.legend()
    plt.yscale("log")
    plt.xscale("log")
    plt.xlabel("number of harmonics")
    plt.ylabel("time (s)")
    plt.title(f"backends comparison {num_threads} threads")
    plt.tight_layout()


##############################################################################
# Speedup vs. number of harmonics
# -----------------------------------

for num_threads in threads:
    plt.figure(figsize=figsize)
    arch_np = np.load(f"{num_threads}/benchmark_numpy_cpu.npz", allow_pickle=True)
    i = 1
    for backend in backends:
        for device in devices:
            g = "cuda" if device == "gpu" else device
            if device != "gpu" or backend == "torch":
                arch = np.load(
                    f"{num_threads}/benchmark_{backend}_{g}.npz", allow_pickle=True
                )
                if backend != "numpy":
                    speedup = np.array(arch_np["times"]) / np.array(arch["times"])
                    plt.plot(
                        arch["real_nh"],
                        speedup,
                        f"-{markers[i]}",
                        c=colors[i],
                        label=f"{backend} {device}",
                    )
                    speedup_all = np.array(arch_np["times_all"]) / np.array(
                        arch["times_all"]
                    )
                    speedup_std = np.std(speedup_all, axis=1)
                    plt.errorbar(
                        arch["real_nh"],
                        speedup,
                        speedup_std,
                        c=colors[i],
                        capsize=1,
                    )
                    i += 1
    plt.legend()
    # plt.yscale("log")
    # plt.xscale("log")
    plt.xlabel("number of harmonics")
    plt.ylabel("speedup vs. numpy")
    plt.title(f"backends comparison {num_threads} threads")
    plt.tight_layout()
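##############################################################################
# Benchmark data format
# ---------------------
# The archives loaded above are ``.npz`` files with keys ``real_nh``
# (harmonic counts actually used), ``times`` (one timing per harmonic count)
# and ``times_all`` (individual repetitions, one row per harmonic count).
# The helper below is a minimal sketch of how such a file could be written,
# assuming ``times`` is the mean over repetitions; the name
# ``save_mock_benchmark`` is hypothetical and this is not the benchmark
# driver used to produce the data read here.


def save_mock_benchmark(filename, real_nh, times_all):
    # times_all: array of shape (number of harmonic counts, repetitions)
    times_all = np.asarray(times_all)
    np.savez(
        filename,
        real_nh=np.asarray(real_nh),
        times=times_all.mean(axis=1),
        times_all=times_all,
    )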
##############################################################################
# Time vs. number of threads
# -----------------------------

for inh in range(len(NH)):
    plt.figure(figsize=figsize)
    i = 0
    for backend in backends:
        for device in devices:
            t_threads = []
            t_threads_all = []
            for num_threads in threads:
                if device != "gpu" or backend == "torch":
                    g = "cuda" if device == "gpu" else device
                    arch = np.load(
                        f"{num_threads}/benchmark_{backend}_{g}.npz", allow_pickle=True
                    )
                    t = arch["times"]
                    # t = np.array(t)
                    t_threads.append(t)
                    t_threads_all.append(arch["times_all"])
            if t_threads != []:
                t_threads = np.array(t_threads)
                plt.plot(
                    threads,
                    t_threads[:, inh],
                    f"-{markers[i]}",
                    c=colors[i],
                    label=f"{backend} {device}",
                )
                times_all = np.array(t_threads_all)[:, inh]
                times_std = np.std(times_all, axis=1)
                plt.errorbar(
                    threads,
                    t_threads[:, inh],
                    times_std,
                    c=colors[i],
                    capsize=1,
                )
                i += 1
    plt.xticks(threads)
    plt.legend(ncol=2)
    plt.yscale("log")
    # plt.xscale("log")
    plt.xlabel("number of threads")
    plt.ylabel("time (s)")
    plt.title(f"backends comparison {NH[inh]} harmonics")
    plt.tight_layout()


##############################################################################
# Speedup vs. number of threads
# -----------------------------

for inh in range(len(NH)):
    plt.figure(figsize=figsize)
    i = 1
    for backend in backends:
        for device in devices:
            speedup_threads = []
            speedup_threads_all = []
            for num_threads in threads:
                if device != "gpu" or backend == "torch":
                    g = "cuda" if device == "gpu" else device
                    arch = np.load(
                        f"{num_threads}/benchmark_{backend}_{g}.npz", allow_pickle=True
                    )
                    arch_np = np.load(
                        f"{num_threads}/benchmark_numpy_cpu.npz", allow_pickle=True
                    )
                    if backend != "numpy":
                        t = arch["times"]
                        speedup = np.array(arch_np["times"]) / np.array(arch["times"])
                        speedup_threads.append(speedup)
                        speedup_all = np.array(arch_np["times_all"]) / np.array(
                            arch["times_all"]
                        )
                        speedup_threads_all.append(speedup_all)
            if speedup_threads != []:
                speedup_threads = np.array(speedup_threads)
                if backend != "numpy":
                    plt.plot(
                        threads,
                        speedup_threads[:, inh],
                        f"-{markers[i]}",
                        c=colors[i],
                        label=f"{backend} {device}",
                    )
                    speedup_std = np.std(np.array(speedup_threads_all)[:, inh], axis=1)
                    plt.errorbar(
                        threads,
                        speedup_threads[:, inh],
                        speedup_std,
                        c=colors[i],
                        capsize=1,
                    )
                    i += 1
    plt.xticks(threads)
    # plt.ylim(0.25, 3.8)
    plt.legend(ncol=2)
    # plt.yscale("log")
    # plt.xscale("log")
    plt.xlabel("number of threads")
    plt.ylabel("speedup vs. numpy")
    plt.title(f"backends comparison {NH[inh]} harmonics")
    plt.tight_layout()
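##############################################################################
# Speedup summary
# ---------------
# As a convenience, the snippet below prints the mean speedup of each backend
# relative to numpy at the largest number of harmonics and the highest thread
# count, averaged over the repetitions stored in ``times_all``. This is a
# minimal sketch added for illustration, assuming the same archive layout as
# above; it is not part of the original benchmark post-processing.

num_threads = threads[-1]
arch_np = np.load(f"{num_threads}/benchmark_numpy_cpu.npz", allow_pickle=True)
ref = np.array(arch_np["times_all"])[-1]  # numpy repetitions at the largest NH
for backend in backends:
    for device in devices:
        if device == "gpu" and backend != "torch":
            continue
        g = "cuda" if device == "gpu" else device
        arch = np.load(f"{num_threads}/benchmark_{backend}_{g}.npz", allow_pickle=True)
        mean_speedup = ref.mean() / np.array(arch["times_all"])[-1].mean()
        print(f"{backend} ({device}): mean speedup vs. numpy = {mean_speedup:.2f}")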