fit_pch.Rd
fit_pch()
fits a polytope (Principal Convex Hull) to data using PCHA algorithm. All of the listed functions take input matrix dim(variables/dimentions, examples) and return archetype positions (XC) of dim(variables/dimentions, archetypes).
k_fit_pch()
finds polytopes of k dimensions in the data. This function applies fit_pch()
to different k-s.
fit_pch_bootstrap()
uses resampling data with or without replacement (bootstrapping) to find robust positions of archetypes of a polytope (Principal Convex Hull) describing the data. This function uses fit_pch_resample()
.
average_pch_fits()
averages archetypes positions and relevant metrics across polytope fit obtained using bootstraping. Uses output of fit_pch_bootstrap() directly.
fit_pch_resample()
takes one sample of the data and fits a polytope (Principal Convex Hull) to data. This function uses fit_pch()
.
randomise_fit_pch()
helps answer the question "how likely you are to observe data shaped as polytope given no relationship between variables?" This function disrupts the relationships between variables, find a polytope that best describes the data. It calculates variance explained, t-ratio of polytope volume to convex hull volume. Optionally, it uses bootstrapping of data points to measure variance in archetype positions. This function uses randomise_fit_pch1()
. Number of archetypes is determined automatically from arc_data
.
randomise_fit_pch1()
disrupts the relationships between variables (one sampling iteration), keeping the distribution of each variable constant, and fits a polytope (Principal Convex Hull) to data. This function uses rand_var
and fit_pch()
.
fit_convhulln()
computes smallest convex hull that encloses a set of points using convhulln
and returns positions of convex hull points (positions = TRUE).
merge_arch_dist()
calculates distance to archtypes, and merges distance matrix to data that was used to identify archetypes, optionally adds other features of data points (through colData).
plot.r_pch_fit() plot distribution of t-ratio, total variance in archetype positions and variance explained used to calculate empirical p-value
plot_dim_var() plot distribution of variance in archetype positions in each dimension and corresponding empirical p-values
pch_fit()
a constructor function for the "pch_fit" class
subset.k_pch_fit()
subsetting method for k_pch_fit object, allows selecting specific number of archetypes from k_fit_pch function output. E.g. useful to do PCA and UMAP projection for selected number of archetypes to aviod recalculation.
fit_pch(data, noc = as.integer(3), I = NULL, U = NULL, delta = 0, verbose = FALSE, conv_crit = 1e-04, maxiter = 500, check_installed = T, order_by = seq(1, nrow(data)), order_type = c("align", "cosine", "side"), volume_ratio = c("t_ratio", "variance_ratio", "none"), converge_else_fail = TRUE, var_in_dims = FALSE, normalise_var = TRUE, method = c("pcha", "kmeans", "louvain", "poisson_aa"), method_options = list()) k_fit_pch(data, ks = 2:4, check_installed = TRUE, bootstrap = FALSE, bootstrap_N = 10, sample_prop = 0.65, bootstrap_type = c("s", "m", "cmq")[1], seed = 345, simplex = FALSE, var_in_dims = FALSE, normalise_var = TRUE, ...) fit_pch_bootstrap(data, n = 3, sample_prop = NULL, check_installed = T, type = c("s", "m", "cmq")[1], clust_options = list(), seed = 235, replace = FALSE, average = FALSE, order_type = c("cosine", "side", "align")[3], var_in_dims = TRUE, normalise_var = TRUE, reference = FALSE, ...) average_pch_fits(res, XC_array = NULL) fit_pch_resample(i = 1, data, sample_prop = NULL, replace = FALSE, var_in_dims = var_in_dims, normalise_var = normalise_var, ...) randomise_fit_pch(data, arc_data, n_rand = 3, replace = FALSE, bootstrap_N = c(NA, 50)[1], seed = 435, volume_ratio = c("t_ratio", "variance_ratio", "none")[1], maxiter = 1000, delta = 0, order_type = "align", type = c("s", "m", "cmq")[1], clust_options = list(), normalise_var = TRUE, check_installed = T, ...) randomise_fit_pch1(i = 1, data, ks = 2:4, replace = FALSE, prob = NULL, bootstrap_N = NA, seed = 435, sample_prop = 0.65, bootstrap_type = c("s", "m", "cmq")[1], return_data = FALSE, return_arc = FALSE, bootstrap_average = FALSE, volume_ratio = c("t_ratio", "variance_ratio", "none")[1], var_in_dims = TRUE, normalise_var = TRUE, ...) fit_convhulln(data, positions = TRUE) merge_arch_dist(arc_data, data, feature_data, colData = NULL, colData_id, dist_metric = c("euclidean", "arch_weights")[1], rank = TRUE) # S3 method for k_pch_fit c(...) # S3 method for pch_fit print(res) # S3 method for k_pch_fit print(res) # S3 method for b_pch_fit print(res) # S3 method for r_pch_fit print(res) # S3 method for r_pch_fit plot(rand_arch, ks = unique(rand_arch$rand_dist$k), type = c("total_var", "varexpl", "t_ratio"), colors = c("#1F77B4", "#D62728", "#2CA02C", "#17BED0", "#006400", "#FF7E0F"), nudge_y = 0.8, nudge_x = 0.1, text_lab_size = 4, line_size = 0.5) plot_dim_var(rand_arch, ks = unique(rand_arch$rand_dist$k), dim_names = c("V1", "V2", "V3", "V4", "V5", "V6"), colors = c("#1F77B4", "#D62728", "#2CA02C", "#17BED0", "#006400", "#FF7E0F"), nudge_y = 0.5, nudge_x = 0.5, text_lab_size = 4, line_size = 0.5) pch_fit(XC, S, C, SSE, varexpl, arc_vol, hull_vol, t_ratio, var_vert, var_dim, total_var, summary, call) subset.k_pch_fit(arc_data, ks)
data | numeric matrix or object coercible to matrix in which to find archetypes, dim(variables/dimentions, examples) |
---|---|
noc | integer, number of archetypes to find |
I | vector, entries of data to use for dictionary in C (optional) |
U | vector, entries of data to model in S (optional) |
delta | parameter that inflates original polytope(simplex) fit such that it may contain more points of the dataset |
verbose | if TRUE display messages |
conv_crit | The convergence criteria (default: 10^-4 relative change in SSE). Reduce to 1e-3 for reduced computation time but more approximate results. Increase to 1e-6 for improved accuracy (python and Matlab default). 1e-4 gives very similar results to 1e-5 or 1e-6 on datasets I looked at. |
maxiter | maximum number of iterations (default: 500 iterations) |
check_installed | if TRUE, check if python module py_pcha is found. Useful to set to FALSE for running analysis or within other functions |
order_by | integer, dimensions to be used for ordering archetypes. Archetypes are ordered by angle (cosine) between c(1, 1) vector and a vector pointing to that archetype. Additional step finds when archetype vector is to the left (counter-clockwise) of the c(1, 1) vector. When bootstraping archetypes can be aligned to reference and ordered (order_type == "align") by these dimensions. |
order_type | order archetypes by: cosine distance from c(1,1, ..., 1) vector ("cosine"), dot product that measures position to each side of the c(1,1) vector ("side"), align positions to reference when bootstraping(fit using all data, "align"). See align_arc When order_type is "align" archetypes are ordered by "cosine" first. |
volume_ratio | find volume of the convex hull of the data and the t-ratio ("t_ratio") or ratio of variance of archetype positions to variance of data ("variance_ratio") or do nothing ("none")? Caution! Geometric figure should be at least simplex: qhull algorhirm will fail to find convex hull of flat 2D shapes in 3D, 3D shapes in 4D and so on. So, for calculating this dimentions are selected based order of rows in data. Makes sense for principal components but not for original data. Caution 2! Computation time and memory use increse very quickly with dimensions. Do not use for more than 7-8 dimentions. |
converge_else_fail | throw an error and stop execution if PCHA did not converge in |
var_in_dims | calculate variance in dimensions across archetypes in addition to variance in archetypes. |
normalise_var | normalise variance in position of archetypes by variance in data in each dimention |
method | method for archetypal analysis: PCHA (default), kmeans, FindClusters (louvain). Non-default methods use |
method_options | additional arguments for non-default method, named list: |
ks | integer vector, dimensions of polytopes to be fit to data |
bootstrap |
|
bootstrap_N | randomise_fit_pch1() and k_fit_pch(): integer, number of bootstrap samples on random data to measure variability in archetype positions. Set to NA fit once to all data instead of bootstraping. When this option is positive seed bootstrap_seed and sample_prop must be provided. |
sample_prop | either NULL or the proportion of dataset that should be included in each sample. If NULL the polytope fitting algorithm is run n times on the same data which is useful for evaluating how often the algorithm gets stuck in local optima. |
bootstrap_type |
|
seed | seed for reproducible random number generation. Works for all types of processing. |
simplex | when testing multiple k using |
n | number of samples to be taken when bootstraping |
type | one of s, m, cmq. s means single core processing using lapply. m means multi-core parallel procession using parLapply. cmq means multi-node parallel processing on a computing cluster using clustermq package. "See also" for details. |
clust_options | list of options for parallel processing. The default for "m" is list(cores = parallel::detectCores()-1, cluster_type = "PSOCK"). The default for "cmq" is list(memory = 2000, template = list(), n_jobs = 10, fail_on_error = FALSE). Change these options as required. |
replace | should resampling be with replacement? passed to sample.int |
average | average archetype positions and varexpl? By default FALSE, return all fits to resampled data. |
reference | align archetypes (order_type="align") against reference polytope based on all data (TRUE), or align to the polytope from the first bootstrap iteration (FALSE). Second option can save time for large datasets |
XC_array | Used by average_pch_fits() inside fit_pch_bootstrap(). You should not to use it in most cases. |
i | iteration number |
arc_data | observed polytope that describes the data. Class pch_fit, b_pch_fit and k_pch_fit objects generated by |
n_rand | number of randomisation samples |
prob | a vector of probability weights for obtaining the elements of the vector being sampled. Passed to |
return_data | return randomised data? |
return_arc | return archetype positions in randomised data? |
bootstrap_average |
|
positions | return positions of convex hull points? |
feature_data | matrix with dim(dimensions, examples) where rownames are feature names and colnames are sample_id. |
colData | annotations of examples in feature_data - dim(examples, dimensions), e.g. colData in SingleCellExperiment object or output of find_set_activity_AUCell. |
colData_id | column in colData that contains values matching colnames of feature_data. |
dist_metric | how to describe distance to archetypes. Currently euclidean distance and archetype weights are implemented. Archetypes weights come from |
rank | rank by distance metric (euclidean distance)? |
replace | fit_pch_resample/fit_pch_bootstrap: TRUE and FALSE are passed to |
fit_pch()
: object of class pch_fit (list) containing the following elements:
XC - numeric matrix, dim(I, noc)/dim(dimensions, archetypes) feature matrix (i.e. XC=data[,I]*C forming the archetypes), or matrix with cluster centers (method = "kmeans");
S - numeric matrix of archetype values per example (e.g. cell) or cluster membership (method = "kmeans"), dim(noc, length(U)) matrix, S>=0 |S_j|_1=1;
C - numeric matrix, dim(noc, length(U)) matrix, S>=0 |S_j|_1=1;
SSE - numeric vector (1L), Sum of Squared Errors;
varexpl - numeric vector (1L), Percent variation explained by the model.
hull_vol - numeric vector (1L), Volume of convex hull of the data.
arc_vol - numeric vector (1L), Volume of polytope enclosed by archetypes.
t_ratio - numeric vector (1L), Ratio of arc_vol
to hull_vol
var_vert - numeric vector (noc L), Variance in position of each archetype.
var_dim - numeric vector (noc L), Variance in positions of archetypes in each dimension.
total_var - numeric vector (1L), Mean variance in position of all archetypes.
summary - data.table with varexpl, t_ratio, total_var and noc for ease of combining multiple shape fits.
call - function call.
k_fit_pch()
: object of class k_pch_fit (list) containing the same elements as pch_fit, but each is either a list of pch_fit elements (e.g. list of ks number of XC matrices) or a vector (which pch_fit element is one number). When length(ks) = 1 returns pch_fit.
fit_pch_bootstrap()
object of class b_pch_fit (list) containing the same elements as pch_fit, but each is either a list of pch_fit elements (e.g. list of n number of XC matrices) or a vector (which pch_fit element is one number).
average_pch_fits()
object of class pch_fit
fit_pch_resample()
object of class pch_fit
randomise_fit_pch()
S3 "r_pch_fit" object (list) containing: 1. data table with columns variance explained (rand_varexpl), t-ratio (rand_t_ratio), variance in positions of archetypes (total_var) and number of archetypes (k) for n_rand
samples; 2. empirical p-value for those parameters for each number of archetypes.
randomise_fit_pch1()
: list containing function call, summary of the sample, optional data and optional position of archetypes.
fit_convhulln()
list with 3 elements: hull - matrix storing positions of points dim(points, dimensions)); area - surface area for 3D or higher and perimeter for 2D; vol - volume for 3D and surface area for 2D.
merge_arch_dist()
list: data.table with samples in rows (speficied by sample_id column) and features in columns (including distance to archetypes); and character vectors listing names of archetypes, feature columns and colData columns.
fit_pch()
provides an R interface to python implementation of PCHA algorithm (Principal Convex Hull Analysis) by Ulf Aslak (https://github.com/ulfaslak/py_pcha) which was originally developed for Archetypal Analysis by Mørup et. al.
set.seed(4355) archetypes = generate_arc(arc_coord = list(c(5, 0), c(-10, 15), c(-30, -20)), mean = 0, sd = 1, N_dim = 2)#> Error in generate_arc(arc_coord = list(c(5, 0), c(-10, 15), c(-30, -20)), mean = 0, sd = 1, N_dim = 2): unused argument (N_dim = 2)#> Error in ncol(archetypes): object 'archetypes' not founddim(data)#> NULL#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'# Fit the model 3 times without resampling to test convergence of the algorithm. arc_rob = fit_pch_bootstrap(data, n = 3, sample_prop = NULL, noc=as.integer(3), delta=0)#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'# Fit 10 models to resampled datasets each time looking at 70% of examples. arc_data = fit_pch_bootstrap(data, n = 10, sample_prop = 0.7, noc=as.integer(3), delta=0)#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'# Use local parallel processing to fit 10 models to resampled datasets each time looking at 70% of examples. arc_data = fit_pch_bootstrap(data, n = 10, sample_prop = 0.7, noc=as.integer(3), delta=0, type = "m")#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'# Fit models with 2-4 archetypes arc_ks = k_fit_pch(data, ks = 2:4, check_installed = T, delta=0)#> Error in as.vector(x, mode): cannot coerce type 'closure' to vector of type 'any'# Evaluate how much archetypes vary in randomised data, (variable shuffled # without replacement) rand3 = randomise_fit_pch1(i = 1, data, true_fit = NULL, replace = FALSE, bootstrap_N = 200, seed = 2543, return_data = T, return_arc = T, sample_prop = 0.65, order_type = "align", noc = as.integer(3), delta = 0.1, bootstrap_type = "cmq")#> Error in base::apply(X = X, MARGIN = MARGIN, FUN = FUN, ...): dim(X) must have a positive length#> Error in is(arc_data, "pch_fit"): object 'arc_data' not found# plot shapes of observed data and randomised data side-by-side using cowplot library(cowplot) # create compound plot p_all = plot_grid(plotlist = list(p + theme(legend.position = "none"), plot_arc(rand3$arc_data, rand3$data, which_dimensions = 1:2, line_size = 1) + theme(legend.position = "none")))#> Error in plot_grid(plotlist = list(p + theme(legend.position = "none"), plot_arc(rand3$arc_data, rand3$data, which_dimensions = 1:2, line_size = 1) + theme(legend.position = "none"))): object 'p' not found#> Error in as_gtable(plot): object 'p' not found#> Error in plot_grid(p_all, legend, rel_widths = c(3.2, 0.6)): object 'p_all' not found