Find the smallest polytope (Principal Convex Hull) that contains most of the data

fit_pch() fits a polytope (Principal Convex Hull) to data using PCHA algorithm. All of the listed functions take input matrix dim(variables/dimentions, examples) and return archetype positions (XC) of dim(variables/dimentions, archetypes).

k_fit_pch() finds polytopes of k dimensions in the data. This function applies fit_pch() to different k-s.

fit_pch_bootstrap() uses resampling data with or without replacement (bootstrapping) to find robust positions of archetypes of a polytope (Principal Convex Hull) describing the data. This function uses fit_pch_resample().

average_pch_fits() averages archetypes positions and relevant metrics across polytope fit obtained using bootstraping. Uses output of fit_pch_bootstrap() directly.

fit_pch_resample() takes one sample of the data and fits a polytope (Principal Convex Hull) to data. This function uses fit_pch().

randomise_fit_pch() helps answer the question "how likely you are to observe data shaped as polytope given no relationship between variables?" This function disrupts the relationships between variables, find a polytope that best describes the data. It calculates variance explained, t-ratio of polytope volume to convex hull volume. Optionally, it uses bootstrapping of data points to measure variance in archetype positions. This function uses randomise_fit_pch1(). Number of archetypes is determined automatically from arc_data.

randomise_fit_pch1() disrupts the relationships between variables (one sampling iteration), keeping the distribution of each variable constant, and fits a polytope (Principal Convex Hull) to data. This function uses rand_var and fit_pch().

fit_convhulln() computes smallest convex hull that encloses a set of points using convhulln and returns positions of convex hull points (positions = TRUE).

merge_arch_dist() calculates distance to archtypes, and merges distance matrix to data that was used to identify archetypes, optionally adds other features of data points (through colData).

plot.r_pch_fit() plot distribution of t-ratio, total variance in archetype positions and variance explained used to calculate empirical p-value

plot_dim_var() plot distribution of variance in archetype positions in each dimension and corresponding empirical p-values

pch_fit() a constructor function for the "pch_fit" class

subset.k_pch_fit() subsetting method for k_pch_fit object, allows selecting specific number of archetypes from k_fit_pch function output. E.g. useful to do PCA and UMAP projection for selected number of archetypes to aviod recalculation.

fit_pch(data, noc = as.integer(3), I = NULL, U = NULL, delta = 0,
  verbose = FALSE, conv_crit = 1e-04, maxiter = 500,
  check_installed = T, order_by = seq(1, nrow(data)),
  order_type = c("align", "cosine", "side"),
  volume_ratio = c("t_ratio", "variance_ratio", "none"),
  converge_else_fail = TRUE, var_in_dims = FALSE,
  normalise_var = TRUE, method = c("pcha", "kmeans", "louvain",
  "poisson_aa"), method_options = list())

k_fit_pch(data, ks = 2:4, check_installed = TRUE, bootstrap = FALSE,
  bootstrap_N = 10, sample_prop = 0.65, bootstrap_type = c("s", "m",
  "cmq")[1], seed = 345, simplex = FALSE, var_in_dims = FALSE,
  normalise_var = TRUE, ...)

fit_pch_bootstrap(data, n = 3, sample_prop = NULL,
  check_installed = T, type = c("s", "m", "cmq")[1],
  clust_options = list(), seed = 235, replace = FALSE,
  average = FALSE, order_type = c("cosine", "side", "align")[3],
  var_in_dims = TRUE, normalise_var = TRUE, reference = FALSE, ...)

average_pch_fits(res, XC_array = NULL)

fit_pch_resample(i = 1, data, sample_prop = NULL, replace = FALSE,
  var_in_dims = var_in_dims, normalise_var = normalise_var, ...)

randomise_fit_pch(data, arc_data, n_rand = 3, replace = FALSE,
  bootstrap_N = c(NA, 50)[1], seed = 435, volume_ratio = c("t_ratio",
  "variance_ratio", "none")[1], maxiter = 1000, delta = 0,
  order_type = "align", type = c("s", "m", "cmq")[1],
  clust_options = list(), normalise_var = TRUE, check_installed = T,
  ...)

randomise_fit_pch1(i = 1, data, ks = 2:4, replace = FALSE,
  prob = NULL, bootstrap_N = NA, seed = 435, sample_prop = 0.65,
  bootstrap_type = c("s", "m", "cmq")[1], return_data = FALSE,
  return_arc = FALSE, bootstrap_average = FALSE,
  volume_ratio = c("t_ratio", "variance_ratio", "none")[1],
  var_in_dims = TRUE, normalise_var = TRUE, ...)

fit_convhulln(data, positions = TRUE)

merge_arch_dist(arc_data, data, feature_data, colData = NULL, colData_id,
  dist_metric = c("euclidean", "arch_weights")[1], rank = TRUE)

# S3 method for k_pch_fit
c(...)

# S3 method for pch_fit
print(res)

# S3 method for k_pch_fit
print(res)

# S3 method for b_pch_fit
print(res)

# S3 method for r_pch_fit
print(res)

# S3 method for r_pch_fit
plot(rand_arch, ks = unique(rand_arch$rand_dist$k),
  type = c("total_var", "varexpl", "t_ratio"), colors = c("#1F77B4",
  "#D62728", "#2CA02C", "#17BED0", "#006400", "#FF7E0F"), nudge_y = 0.8,
  nudge_x = 0.1, text_lab_size = 4, line_size = 0.5)

plot_dim_var(rand_arch, ks = unique(rand_arch$rand_dist$k),
  dim_names = c("V1", "V2", "V3", "V4", "V5", "V6"),
  colors = c("#1F77B4", "#D62728", "#2CA02C", "#17BED0", "#006400",
  "#FF7E0F"), nudge_y = 0.5, nudge_x = 0.5, text_lab_size = 4,
  line_size = 0.5)

pch_fit(XC, S, C, SSE, varexpl, arc_vol, hull_vol, t_ratio, var_vert,
  var_dim, total_var, summary, call)

subset.k_pch_fit(arc_data, ks)

Arguments

data	numeric matrix or object coercible to matrix in which to find archetypes, dim(variables/dimentions, examples)
noc	integer, number of archetypes to find
I	vector, entries of data to use for dictionary in C (optional)
U	vector, entries of data to model in S (optional)
delta	parameter that inflates original polytope(simplex) fit such that it may contain more points of the dataset
verbose	if TRUE display messages
conv_crit	The convergence criteria (default: 10^-4 relative change in SSE). Reduce to 1e-3 for reduced computation time but more approximate results. Increase to 1e-6 for improved accuracy (python and Matlab default). 1e-4 gives very similar results to 1e-5 or 1e-6 on datasets I looked at.
maxiter	maximum number of iterations (default: 500 iterations)
check_installed	if TRUE, check if python module py_pcha is found. Useful to set to FALSE for running analysis or within other functions
order_by	integer, dimensions to be used for ordering archetypes. Archetypes are ordered by angle (cosine) between c(1, 1) vector and a vector pointing to that archetype. Additional step finds when archetype vector is to the left (counter-clockwise) of the c(1, 1) vector. When bootstraping archetypes can be aligned to reference and ordered (order_type == "align") by these dimensions.
order_type	order archetypes by: cosine distance from c(1,1, ..., 1) vector ("cosine"), dot product that measures position to each side of the c(1,1) vector ("side"), align positions to reference when bootstraping(fit using all data, "align"). See align_arc When order_type is "align" archetypes are ordered by "cosine" first.
volume_ratio	find volume of the convex hull of the data and the t-ratio ("t_ratio") or ratio of variance of archetype positions to variance of data ("variance_ratio") or do nothing ("none")? Caution! Geometric figure should be at least simplex: qhull algorhirm will fail to find convex hull of flat 2D shapes in 3D, 3D shapes in 4D and so on. So, for calculating this dimentions are selected based order of rows in data. Makes sense for principal components but not for original data. Caution 2! Computation time and memory use increse very quickly with dimensions. Do not use for more than 7-8 dimentions.
converge_else_fail	throw an error and stop execution if PCHA did not converge in `maxiter` steps.
var_in_dims	calculate variance in dimensions across archetypes in addition to variance in archetypes.
normalise_var	normalise variance in position of archetypes by variance in data in each dimention
method	method for archetypal analysis: PCHA (default), kmeans, FindClusters (louvain). Non-default methods use `data` and `noc`for data matrix and the number of archetypes, but require to pass additional arguments via method_options.
method_options	additional arguments for non-default method, named list: `list(arg = "value")`.
ks	integer vector, dimensions of polytopes to be fit to data
bootstrap	`k_fit_pch()`: use bootstrap to find average positions for each k? Also returns variability in archetype position to aid the selection of k. At excessive k position vary more.
bootstrap_N	randomise_fit_pch1() and k_fit_pch(): integer, number of bootstrap samples on random data to measure variability in archetype positions. Set to NA fit once to all data instead of bootstraping. When this option is positive seed bootstrap_seed and sample_prop must be provided.
sample_prop	either NULL or the proportion of dataset that should be included in each sample. If NULL the polytope fitting algorithm is run n times on the same data which is useful for evaluating how often the algorithm gets stuck in local optima.
bootstrap_type	`randomise_fit_pch1()` and `k_fit_pch()`: parallel processing type when bootstraping. Caution: avoid nested parallel processing, do not use "m" and "cmq" inside other parallel functions.
seed	seed for reproducible random number generation. Works for all types of processing.
simplex	when testing multiple k using `k_fit_pch()` match dimensions to the number of archetypes? Use only on ordered principal components. If FALSE try all k for all dimensions in data. If simplex == TRUE test only simplex shapes (k=3 in 2D, k=4 in 3D, k=5 in 4D...). This assumes order of columns which is valid for principal components but may not be valid for untransformed data.
n	number of samples to be taken when bootstraping
type	one of s, m, cmq. s means single core processing using lapply. m means multi-core parallel procession using parLapply. cmq means multi-node parallel processing on a computing cluster using clustermq package. "See also" for details.
clust_options	list of options for parallel processing. The default for "m" is list(cores = parallel::detectCores()-1, cluster_type = "PSOCK"). The default for "cmq" is list(memory = 2000, template = list(), n_jobs = 10, fail_on_error = FALSE). Change these options as required.
replace	should resampling be with replacement? passed to sample.int
average	average archetype positions and varexpl? By default FALSE, return all fits to resampled data.
reference	align archetypes (order_type="align") against reference polytope based on all data (TRUE), or align to the polytope from the first bootstrap iteration (FALSE). Second option can save time for large datasets
XC_array	Used by average_pch_fits() inside fit_pch_bootstrap(). You should not to use it in most cases.
i	iteration number
arc_data	observed polytope that describes the data. Class pch_fit, b_pch_fit and k_pch_fit objects generated by `fit_pch()` or `fit_pch_bootstrap()`
n_rand	number of randomisation samples
prob	a vector of probability weights for obtaining the elements of the vector being sampled. Passed to `(sample.int`.
return_data	return randomised data?
return_arc	return archetype positions in randomised data?
bootstrap_average	`randomise_fit_pch1()`: average positions and summary statistics when bootstraping? Passed to `average` argument of `fit_pch_bootstrap()`. When multiple ks this defaults to TRUE.
positions	return positions of convex hull points?
feature_data	matrix with dim(dimensions, examples) where rownames are feature names and colnames are sample_id.
colData	annotations of examples in feature_data - dim(examples, dimensions), e.g. colData in SingleCellExperiment object or output of find_set_activity_AUCell.
colData_id	column in colData that contains values matching colnames of feature_data.
dist_metric	how to describe distance to archetypes. Currently euclidean distance and archetype weights are implemented. Archetypes weights come from `arc_data$S` matrix so bootstrapped archetype positions cannot be used - only single fit to all data.
rank	rank by distance metric (euclidean distance)?
replace	fit_pch_resample/fit_pch_bootstrap: TRUE and FALSE are passed to `sample.int` for density-based sampling, "geo_sketch" tells to sample examples using geosketch method preserving geometry of the data. See geo_sketch.

Value

fit_pch(): object of class pch_fit (list) containing the following elements: XC - numeric matrix, dim(I, noc)/dim(dimensions, archetypes) feature matrix (i.e. XC=data[,I]*C forming the archetypes), or matrix with cluster centers (method = "kmeans"); S - numeric matrix of archetype values per example (e.g. cell) or cluster membership (method = "kmeans"), dim(noc, length(U)) matrix, S>=0 |S_j|_1=1; C - numeric matrix, dim(noc, length(U)) matrix, S>=0 |S_j|_1=1; SSE - numeric vector (1L), Sum of Squared Errors; varexpl - numeric vector (1L), Percent variation explained by the model. hull_vol - numeric vector (1L), Volume of convex hull of the data. arc_vol - numeric vector (1L), Volume of polytope enclosed by archetypes. t_ratio - numeric vector (1L), Ratio of arc_vol to hull_vol var_vert - numeric vector (noc L), Variance in position of each archetype. var_dim - numeric vector (noc L), Variance in positions of archetypes in each dimension. total_var - numeric vector (1L), Mean variance in position of all archetypes. summary - data.table with varexpl, t_ratio, total_var and noc for ease of combining multiple shape fits. call - function call.

k_fit_pch(): object of class k_pch_fit (list) containing the same elements as pch_fit, but each is either a list of pch_fit elements (e.g. list of ks number of XC matrices) or a vector (which pch_fit element is one number). When length(ks) = 1 returns pch_fit.

fit_pch_bootstrap() object of class b_pch_fit (list) containing the same elements as pch_fit, but each is either a list of pch_fit elements (e.g. list of n number of XC matrices) or a vector (which pch_fit element is one number).

average_pch_fits() object of class pch_fit

fit_pch_resample() object of class pch_fit

randomise_fit_pch() S3 "r_pch_fit" object (list) containing: 1. data table with columns variance explained (rand_varexpl), t-ratio (rand_t_ratio), variance in positions of archetypes (total_var) and number of archetypes (k) for n_rand samples; 2. empirical p-value for those parameters for each number of archetypes.

randomise_fit_pch1(): list containing function call, summary of the sample, optional data and optional position of archetypes.

fit_convhulln() list with 3 elements: hull - matrix storing positions of points dim(points, dimensions)); area - surface area for 3D or higher and perimeter for 2D; vol - volume for 3D and surface area for 2D.

merge_arch_dist() list: data.table with samples in rows (speficied by sample_id column) and features in columns (including distance to archetypes); and character vectors listing names of archetypes, feature columns and colData columns.

Details

fit_pch() provides an R interface to python implementation of PCHA algorithm (Principal Convex Hull Analysis) by Ulf Aslak (https://github.com/ulfaslak/py_pcha) which was originally developed for Archetypal Analysis by Mørup et. al.

Examples

set.seed(4355)
archetypes = generate_arc(arc_coord = list(c(5, 0), c(-10, 15), c(-30, -20)),
                          mean = 0, sd = 1, N_dim = 2)
#> Error in generate_arc(arc_coord = list(c(5, 0), c(-10, 15), c(-30, -20)),     mean = 0, sd = 1, N_dim = 2): unused argument (N_dim = 2)
data = generate_data(archetypes, N_examples = 1e4, jiiter = 0.04, size = 0.9)
#> Error in ncol(archetypes): object 'archetypes' not found
dim(data)
#> NULL
# Find 3 archetypes in this data
arc = fit_pch(data, noc=as.integer(3), delta=0)
#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'
# Fit the model 3 times without resampling to test convergence of the algorithm.
arc_rob = fit_pch_bootstrap(data, n = 3, sample_prop = NULL,
                         noc=as.integer(3), delta=0)
#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'
# Fit 10 models to resampled datasets each time looking at 70% of examples.
arc_data = fit_pch_bootstrap(data, n = 10, sample_prop = 0.7,
                         noc=as.integer(3), delta=0)
#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'

# Use local parallel processing to fit 10 models to resampled datasets each time looking at 70% of examples.
arc_data = fit_pch_bootstrap(data, n = 10, sample_prop = 0.7,
                         noc=as.integer(3), delta=0, type = "m")
#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'

# Fit models with 2-4 archetypes
arc_ks = k_fit_pch(data, ks = 2:4, check_installed = T, delta=0)
#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'

# Evaluate how much archetypes vary in randomised data, (variable shuffled
# without replacement)
rand3 = randomise_fit_pch1(i = 1, data, true_fit = NULL,
    replace = FALSE, bootstrap_N = 200, seed = 2543,
    return_data = T, return_arc = T, sample_prop = 0.65,
    order_type = "align", noc = as.integer(3),
    delta = 0.1, bootstrap_type = "cmq")
#> Error in base::apply(X = X, MARGIN = MARGIN, FUN = FUN, ...): dim(X) must have a positive length
p = plot_arc(arc_data, data, which_dimensions = 1:2, line_size = 1)
#> Error in is(arc_data, "pch_fit"): object 'arc_data' not found
# plot shapes of observed data and randomised data side-by-side using cowplot
library(cowplot)
# create compound plot
p_all = plot_grid(plotlist = list(p + theme(legend.position = "none"),
                                  plot_arc(rand3$arc_data, rand3$data,
                                    which_dimensions = 1:2, line_size = 1) +
                                    theme(legend.position = "none")))
#> Error in plot_grid(plotlist = list(p + theme(legend.position = "none"),     plot_arc(rand3$arc_data, rand3$data, which_dimensions = 1:2,         line_size = 1) + theme(legend.position = "none"))): object 'p' not found
legend = get_legend(p)
#> Error in as_gtable(plot): object 'p' not found
# add legend to plot
plot_grid(p_all, legend, rel_widths = c(3.2, .6))
#> Error in plot_grid(p_all, legend, rel_widths = c(3.2, 0.6)): object 'p_all' not found

Find the smallest polytope (Principal Convex Hull) that contains most of the data

Arguments

Value

Details

See also

Examples

Contents

Author