Performance Debugging#

The simple solver with performance debugging example.

Kind: logging
Builds on: simple-solver-logging, minimal-cuda-solver
Upstream source: examples/performance-debugging/performance-debugging.cpp in the Ginkgo repository.

Introduction#

This example runs a solver on a test problem and shows how to use loggers to debug performance and convergence rate.

The commented program#

#include <algorithm>
#include <array>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <ostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <ginkgo/ginkgo.hpp>


template <typename ValueType>
using vec = gko::matrix::Dense<ValueType>;


template <typename ValueType>
using real_vec = gko::matrix::Dense<gko::remove_complex<ValueType>>;


namespace utils {

creates a zero vector

template <typename ValueType>
std::unique_ptr<vec<ValueType>> create_vector(
    std::shared_ptr<const gko::Executor> exec, gko::size_type size,
    ValueType value)
{
    auto res = vec<ValueType>::create(exec);
    res->read(gko::matrix_data<ValueType>(gko::dim<2>{size, 1}, value));
    return res;
}

utilities for computing norms and residuals

template <typename ValueType>
ValueType get_first_element(const vec<ValueType>* norm)
{
    return norm->get_executor()->copy_val_to_host(norm->get_const_values());
}


template <typename ValueType>
gko::remove_complex<ValueType> compute_norm(const vec<ValueType>* b)
{
    auto exec = b->get_executor();
    auto b_norm = gko::initialize<real_vec<ValueType>>({0.0}, exec);
    b->compute_norm2(b_norm);
    return get_first_element(b_norm.get());
}


template <typename ValueType>
gko::remove_complex<ValueType> compute_residual_norm(
    const gko::LinOp* system_matrix, const vec<ValueType>* b,
    const vec<ValueType>* x)
{
    auto exec = system_matrix->get_executor();
    auto one = gko::initialize<vec<ValueType>>({1.0}, exec);
    auto neg_one = gko::initialize<vec<ValueType>>({-1.0}, exec);
    auto res = gko::clone(b);
    system_matrix->apply(one, x, neg_one, res);
    return compute_norm(res.get());
}


}  // namespace utils


namespace loggers {

A logger that accumulates the time of all operations. For each operation type (allocations, free, copy, internal operations i.e. kernels), the timing is taken before and after. This can create significant overhead since to ensure proper timings, calls to synchronize are required.

struct OperationLogger : gko::log::Logger {
    void on_allocation_started(const gko::Executor* exec,
                               const gko::size_type&) const override
    {
        this->start_operation(exec, "allocate");
    }

    void on_allocation_completed(const gko::Executor* exec,
                                 const gko::size_type&,
                                 const gko::uintptr&) const override
    {
        this->end_operation(exec, "allocate");
    }

    void on_free_started(const gko::Executor* exec,
                         const gko::uintptr&) const override
    {
        this->start_operation(exec, "free");
    }

    void on_free_completed(const gko::Executor* exec,
                           const gko::uintptr&) const override
    {
        this->end_operation(exec, "free");
    }

    void on_copy_started(const gko::Executor* from, const gko::Executor* to,
                         const gko::uintptr&, const gko::uintptr&,
                         const gko::size_type&) const override
    {
        from->synchronize();
        this->start_operation(to, "copy");
    }

    void on_copy_completed(const gko::Executor* from, const gko::Executor* to,
                           const gko::uintptr&, const gko::uintptr&,
                           const gko::size_type&) const override
    {
        from->synchronize();
        this->end_operation(to, "copy");
    }

    void on_operation_launched(const gko::Executor* exec,
                               const gko::Operation* op) const override
    {
        this->start_operation(exec, op->get_name());
    }

    void on_operation_completed(const gko::Executor* exec,
                                const gko::Operation* op) const override
    {
        this->end_operation(exec, op->get_name());
    }

    void write_data(std::ostream& ostream)
    {
        for (const auto& entry : total) {
            ostream << "\t" << entry.first.c_str() << ": "
                    << std::chrono::duration_cast<std::chrono::nanoseconds>(
                           entry.second)
                           .count()
                    << std::endl;
        }
    }

private:

Helper which synchronizes and starts the time before every operation.

    void start_operation(const gko::Executor* exec,
                         const std::string& name) const
    {
        nested.emplace_back(0);
        exec->synchronize();
        start[name] = std::chrono::steady_clock::now();
    }

Helper to compute the end time and store the operation’s time at its end. Also time nested operations.

    void end_operation(const gko::Executor* exec, const std::string& name) const
    {
        exec->synchronize();
        const auto end = std::chrono::steady_clock::now();
        const auto diff = end - start[name];

make sure timings for nested operations are not counted twice

        total[name] += diff - nested.back();
        nested.pop_back();
        if (nested.size() > 0) {
            nested.back() += diff;
        }
    }

    mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
    mutable std::map<std::string, std::chrono::steady_clock::duration> total;

the position i of this vector holds the total time spend on child operations on nesting level i

    mutable std::vector<std::chrono::steady_clock::duration> nested;
};

This logger tracks the persistently allocated data

struct StorageLogger : gko::log::Logger {

Store amount of bytes allocated on every allocation

    void on_allocation_completed(const gko::Executor*,
                                 const gko::size_type& num_bytes,
                                 const gko::uintptr& location) const override
    {
        storage[location] = num_bytes;
    }

Reset the amount of bytes on every free

    void on_free_completed(const gko::Executor*,
                           const gko::uintptr& location) const override
    {
        storage[location] = 0;
    }

Write the data after summing the total from all allocations

    void write_data(std::ostream& ostream)
    {
        gko::size_type total{};
        for (const auto& e : storage) {
            total += e.second;
        }
        ostream << "Storage: " << total << std::endl;
    }

private:
    mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
};

Logs true and recurrent residuals of the solver

template <typename ValueType>
struct ResidualLogger : gko::log::Logger {

Depending on the available information, store the norm or compute it from the residual. If the true residual norm could not be computed, store the value -1.0.

    void on_iteration_complete(const gko::LinOp*, const gko::size_type&,
                               const gko::LinOp* residual,
                               const gko::LinOp* solution,
                               const gko::LinOp* residual_norm) const override
    {
        if (residual_norm) {
            rec_res_norms.push_back(utils::get_first_element(
                gko::as<real_vec<ValueType>>(residual_norm)));
        } else {
            rec_res_norms.push_back(
                utils::compute_norm(gko::as<vec<ValueType>>(residual)));
        }
        if (solution) {
            true_res_norms.push_back(utils::compute_residual_norm(
                matrix, b, gko::as<vec<ValueType>>(solution)));
        } else {
            true_res_norms.push_back(-1.0);
        }
    }

    ResidualLogger(const gko::LinOp* matrix, const vec<ValueType>* b)
        : gko::log::Logger(gko::log::Logger::iteration_complete_mask),
          matrix{matrix},
          b{b}
    {}

    void write_data(std::ostream& ostream)
    {
        ostream << "Recurrent Residual Norms: " << std::endl;
        ostream << "[" << std::endl;
        for (const auto& entry : rec_res_norms) {
            ostream << "\t" << entry << std::endl;
        }
        ostream << "];" << std::endl;

        ostream << "True Residual Norms: " << std::endl;
        ostream << "[" << std::endl;
        for (const auto& entry : true_res_norms) {
            ostream << "\t" << entry << std::endl;
        }
        ostream << "];" << std::endl;
    }

private:
    const gko::LinOp* matrix;
    const vec<ValueType>* b;
    mutable std::vector<gko::remove_complex<ValueType>> rec_res_norms;
    mutable std::vector<gko::remove_complex<ValueType>> true_res_norms;
};


}  // namespace loggers


namespace {

Print usage help

void print_usage(const char* filename)
{
    std::cerr << "Usage: " << filename << " [executor] [matrix file]"
              << std::endl;
    std::cerr << "matrix file should be a file in matrix market format. "
                 "The file data/A.mtx is provided as an example."
              << std::endl;
    std::exit(-1);
}


template <typename ValueType>
void print_vector(const gko::matrix::Dense<ValueType>* vec)
{
    auto elements_to_print = std::min(gko::size_type(10), vec->get_size()[0]);
    std::cout << "[" << std::endl;
    for (int i = 0; i < elements_to_print; ++i) {
        std::cout << "\t" << vec->at(i) << std::endl;
    }
    std::cout << "];" << std::endl;
}


}  // namespace


int main(int argc, char* argv[])
{

Parametrize the benchmark here Pick a value type

    using ValueType = double;
    using IndexType = int;

Pick a matrix format

    using mtx = gko::matrix::Csr<ValueType, IndexType>;

Pick a solver

    using solver = gko::solver::Cg<ValueType>;

Pick a preconditioner type

    using preconditioner = gko::matrix::IdentityFactory<ValueType>;

Pick a residual norm reduction value

    const gko::remove_complex<ValueType> reduction_factor = 1e-12;

Pick an output file name

    const auto of_name = "log.txt";

Simple shortcut

    using vec = gko::matrix::Dense<ValueType>;

Print version information

    std::cout << gko::version_info::get() << std::endl;

Figure out where to run the code

    if (argc == 2 && (std::string(argv[1]) == "--help")) {
        std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
        std::exit(-1);
    }

Figure out where to run the code

    const auto executor_string = argc >= 2 ? argv[1] : "reference";
    std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
        exec_map{
            {"omp", [] { return gko::OmpExecutor::create(); }},
            {"cuda",
             [] {
                 return gko::CudaExecutor::create(0,
                                                  gko::OmpExecutor::create());
             }},
            {"hip",
             [] {
                 return gko::HipExecutor::create(0, gko::OmpExecutor::create());
             }},
            {"dpcpp",
             [] {
                 return gko::DpcppExecutor::create(0,
                                                   gko::OmpExecutor::create());
             }},
            {"reference", [] { return gko::ReferenceExecutor::create(); }}};

executor where Ginkgo will perform the computation

    const auto exec = exec_map.at(executor_string)();  // throws if not valid

Read the input matrix file directory

    std::string input_mtx = "data/A.mtx";
    if (argc == 3) {
        input_mtx = std::string(argv[2]);
    }

Read data: A is read from disk Create a StorageLogger to track the size of A

    auto storage_logger = std::make_shared<loggers::StorageLogger>();

Add the logger to the executor

    exec->add_logger(storage_logger);

Read the matrix A from file

    auto A = gko::share(gko::read<mtx>(std::ifstream(input_mtx), exec));

Remove the storage logger

    exec->remove_logger(storage_logger);

Pick a maximum iteration count

    const auto max_iters = A->get_size()[0];

Generate b and x vectors

    auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
    auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);

Declare the solver factory. The preconditioner’s arguments should be adapted if needed.

    auto solver_factory =
        solver::build()
            .with_criteria(
                gko::stop::ResidualNorm<ValueType>::build()
                    .with_reduction_factor(reduction_factor),
                gko::stop::Iteration::build().with_max_iters(max_iters))
            .with_preconditioner(preconditioner::create(exec))
            .on(exec);

Declare the output file for all our loggers

    std::ofstream output_file(of_name);

Do a warmup run

    {

Clone x to not overwrite the original one

        auto x_clone = gko::clone(x);

Generate and call apply on a solver

        solver_factory->generate(A)->apply(b, x_clone);
        exec->synchronize();
    }

Do a timed run

    {

Clone x to not overwrite the original one

        auto x_clone = gko::clone(x);

Synchronize ensures no operation are ongoing

        exec->synchronize();

Time before generate

        auto g_tic = std::chrono::steady_clock::now();

Generate a solver

        auto generated_solver = solver_factory->generate(A);
        exec->synchronize();

Time after generate

        auto g_tac = std::chrono::steady_clock::now();

Compute the generation time

        auto generate_time =
            std::chrono::duration_cast<std::chrono::nanoseconds>(g_tac - g_tic);

Write the generate time to the output file

        output_file << "Generate time (ns): " << generate_time.count()
                    << std::endl;

Similarly time the apply

        exec->synchronize();
        auto a_tic = std::chrono::steady_clock::now();
        generated_solver->apply(b, x_clone);
        exec->synchronize();
        auto a_tac = std::chrono::steady_clock::now();
        auto apply_time =
            std::chrono::duration_cast<std::chrono::nanoseconds>(a_tac - a_tic);
        output_file << "Apply time (ns): " << apply_time.count() << std::endl;

Compute the residual norm

        auto residual =
            utils::compute_residual_norm(A.get(), b.get(), x_clone.get());
        output_file << "Residual_norm: " << residual << std::endl;
    }

Log the internal operations using the OperationLogger without timing

    {

Create an OperationLogger to analyze the generate step

        auto gen_logger = std::make_shared<loggers::OperationLogger>();

Add the generate logger to the executor

        exec->add_logger(gen_logger);

Generate a solver

        auto generated_solver = solver_factory->generate(A);

Remove the generate logger from the executor

        exec->remove_logger(gen_logger);

Write the data to the output file

        output_file << "Generate operations times (ns):" << std::endl;
        gen_logger->write_data(output_file);

Create an OperationLogger to analyze the apply step

        auto apply_logger = std::make_shared<loggers::OperationLogger>();
        exec->add_logger(apply_logger);

Create a ResidualLogger to log the recurrent residual

        auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
            A.get(), b.get());
        generated_solver->add_logger(res_logger);

Solve the system

        generated_solver->apply(b, x);
        exec->remove_logger(apply_logger);

Write the data to the output file

        output_file << "Apply operations times (ns):" << std::endl;
        apply_logger->write_data(output_file);
        res_logger->write_data(output_file);
    }

Print solution

    std::cout << "Solution, first ten entries: \n";
    print_vector(x.get());

Print output file location

    std::cout << "The performance and residual data can be found in " << of_name
              << std::endl;
}

Results#

This is the expected standard output:

Solution, first ten entries:
[
	0.252218
	0.108645
	0.0662811
	0.0630433
	0.0384088
	0.0396536
	0.0402648
	0.0338935
	0.0193098
	0.0234653
];
The performance and residual data can be found in log.txt

Here is a sample output in the file log.txt:

Generate time (ns): 861
Apply time (ns): 108144
Residual_norm: 2.10788e-15
Generate operations times (ns):
Apply operations times (ns):
	allocate: 14991
	cg::initialize#8: 872
	cg::step_1#5: 7683
	cg::step_2#7: 7756
	copy: 7751
	csr::advanced_spmv#5: 21819
	csr::spmv#3: 20429
	dense::compute_dot#3: 18043
	dense::compute_norm2#2: 16726
	free: 8857
	residual_norm::residual_norm#9: 3614
Recurrent Residual Norms:
[
	4.3589
	2.30455
	1.46771
	0.984875
	0.741833
	0.513623
	0.384165
	0.316439
	0.227709
	0.170312
	0.0973722
	0.0616831
	0.0454123
	0.031953
	0.0161606
	0.00657015
	0.00264367
	0.000858809
	0.000286461
	1.64195e-15
];
True Residual Norms:
[
	4.3589
	2.30455
	1.46771
	0.984875
	0.741833
	0.513623
	0.384165
	0.316439
	0.227709
	0.170312
	0.0973722
	0.0616831
	0.0454123
	0.031953
	0.0161606
	0.00657015
	0.00264367
	0.000858809
	0.000286461
	2.10788e-15
];

The plain program#