Application Notes for oneMKL Summary Statistics

ID 772991
Date 3/31/2023
Public
Document Table of Contents

Computing Quantiles for Streaming Data with VSL_SS_METHOD_SQUANTS_ZW

Use the VSL_SS_METHOD_SQUANTS_ZW method to compute quantiles for streaming data [Zhang2007].

The algorithm supports two usage models of data processing for a given task dimension:

  1. Dataset is available as a single block. The number of observations is known by the time you pass the data into the library.

  2. Dataset is available as a sequence of blocks. The number of observations in each chunk is available prior to passing the data into the library. The number of observations can vary from block to block. The number of blocks may be unknown in advance.

The accuracy of quantile estimation ε, which is a parameter of the algorithm, is packed into the params array and passed to the library using the EditStreamQuantiles editor. Before computation, the algorithm checks the value of ε. If ε ≤ 0 or ε > 1, the library terminates the computation and returns the VSL_SS_ERROR_BAD_STREAM_QUANT_PARAMS error status. For correct values of ε, the algorithm returns a quantile estimate located in the interval [r-εn, r+εn]

where

  1. r is the rank of the desired quantile.

  2. n is the total number of available observations.

For details, see [Zhang2007].

If a single data block is available, the quantiles are computed as shown in the example below:

#include "mkl_vsl.h"
#include <stdio.h>
#define DIM 3      /* dimension of the task */
#define N   1000   /* number of observations */
#define M   100    /* number of quantiles to compute */
#define EPS 0.01   /* accuracy of quantile computation */
 
int main()
{
   int i, status;
   VSLSSTaskPtr task;
   float x[DIM][N];  /* matrix of observations */
   float q_order[M], quants[M];
   float params;
   MKL_INT q_order_n;
   MKL_INT p, n, nparams, xstorage;
   int indices[DIM]={1,0,0}; /* the first vector component is processed */
  
   /* Parameters of the task and initialization */
   p = DIM;
   n = N;
 
   q_order_n = M;
   xstorage  = VSL_SS_MATRIX_STORAGE_ROWS;
   params    = EPS;
   nparams   = VSL_SS_SQUANTS_ZW_PARAMS_N;
 
   /* Calculate percentiles */
   for ( i = 0; i < M; i++ ) q_order[i] = (float)i / (float)M;
  
   /* Create a task */
   status = vslsSSNewTask( &task, &p, &n, &xstorage, x, 0, indices );
 
   /* Initialize the task parameters */
   status = vslsSSEditStreamQuantiles( task, &q_order_n, q_order,
                                       quants, &nparams, &params );
 
   /* Compute the percentiles with accuracy eps */
   status = vslsSSCompute( task, VSL_SS_STREAM_QUANTS,
                                 VSL_SS_METHOD_SQUANTS_ZW );
 
   /* Deallocate the task resources */
   status = vslSSDeleteTask( &task );
 
   return 0;
}

If the dataset is available as a sequence of blocks, the quantiles are computed as follows: 

#include "mkl_vsl.h"
#include <stdio.h>
#define DIM 3     /* dimension of the task */
#define N   1000  /* number of observations */
#define M   100   /* number of quantiles to compute */
#define EPS 0.01  /* accuracy of quantile computation */
#define NBLOCKS 5 /* number of data blocks of size N */
 
int main()
{
   int i, status;
   VSLSSTaskPtr task;
   float x[DIM][N];  /* matrix of observations */
   float q_order[M], quants[M];
   float params;
   MKL_INT q_order_n;
   MKL_INT p, n, nparams, xstorage;
   int indices[DIM]={1,0,0}; /* the first vector component is processed */
  
   /* Parameters of the task and initialization */
   p = DIM;
   n = N;
   q_order_n = M;
   xstorage  = VSL_SS_MATRIX_STORAGE_ROWS;
   params    = EPS;
   nparams   = VSL_SS_SQUANTS_ZW_PARAMS_N;
 
   /* Calculate percentiles */
   for ( i = 0; i < M; i++ ) q_order[i] = (float)i / (float)M;
  
   /* Create a task */
   status = vslsSSNewTask( &task, &p, &n, &xstorage, x, 0, indices );
 
   /* Initialize the task parameters */
   status = vslsSSEditStreamQuantiles( task, &q_order_n, q_order,
                                       quants, &nparams, &params );
   for ( i = 0; ; i++ )
   {
      /* Update the internal data structures of the algorithm */
       status = vslsSSCompute( task, VSL_SS_STREAM_QUANTS,
                                     VSL_SS_METHOD_SQUANTS_ZW FAST );
       if ( ++i >= NBLOCKS ) break;
       GetNextDataBlock( x ) ;
   }
 
    /* Compute the percentiles with accuracy eps */
    n = 0;
    status = vslsSSCompute( task, VSL_SS_STREAM_QUANTS,
                                  VSL_SS_METHOD_SQUANTS_ZW );
 
    /* Deallocate the task resources */
    status = vslSSDeleteTask( &task );
 
    return 0;
}

If intermediate quantile estimates are not required, you can analyze your data using the fast computation method VSL_SS_METHOD_SQUANTS_ZW_FAST. In this mode, the algorithm only updates internal data structures when processing the next available block. Actual estimates of the required quantiles are computed after the whole sequence of the blocks is processed. An additional call to the Compute routine with method VSL_SS_METHOD_SQUANTS_ZW returns the final estimate. Before making the final call to the routine, set to zero the variable that holds the number of observations and is registered in the library.

If intermediate estimates of quantiles are required, use the VSL_SS_METHOD_SQUANTS method. In this case, you do not need the additional call to the Compute routine:

#include "mkl_vsl.h"
 
#include <stdio.h>
#define DIM 3      /* dimension of the task */
#define N   1000   /* number of observations */
#define M   100    /* number of quantiles to compute */
#define EPS 0.01   /* accuracy of quantile computation */
#define NBLOCKS 5  /* number of data blocks of size N */
 
int main()
{
    int i, status;
    VSLSSTaskPtr task;
    float x[DIM][N];  /* matrix of observations */
    float q_order[M], quants[M];
    float params;
    MKL_INT quant_order_n;
    MKL_INT p, n, n_params, xstorage;
    int indices[DIM]={1,0,0}; /* the first vector component is processed */
  
    /* Parameters of the task and initialization */
    p = DIM;
    n = N;
    q_order_n = M;
    xstorage  = VSL_SS_MATRIX_STORAGE_ROWS;
    params    = EPS;
    params_n  = VSL_SS_SQUANTS_ZW_PARAMS_N;
 
    /* Calculate percentiles */
    for ( i = 0; i < M; i++ ) q_order[i] = (float)i / (float)M;
  
    /* Create a task */
    status = vslsSSNewTask( &task, &p, &n, &xstorage, x, 0, indices );
   
    /* Initialize the task parameters */
    status =  vslsSSEditStreamQuantiles( task, &q_order_n, q_order,
                                         quants, &n_params, &params );
 
    for ( i = 0; ; i++ )
    {
        /* Compute the percentiles with accuracy eps */
        status = vslsSSCompute( task, VSL_SS_STREAM_QUANTS,
                                      VSL_SS_METHOD_SQUANTS_ZW );
        if ( ++i >= NBLOCKS ) break;
        GetNextDataBlock( x ) ;
    }
 
     /* Deallocate the task resources */
    status = vslSSDeleteTask( &task );
 
    return 0;
}