Cassava IR data

The data are read from .dx files and converted into a dataset list. The “ppds” variable, which is numeric, was also converted to a factor.

setwd("~/Dropbox")
library(metabolomicsUM)
source("Datasets/CassavaPPD/IR/scripts/ir-cassavappd-metadata.R")

ir.cassppd.data.folder = "Datasets/CassavaPPD/IR/data"
ir.cassppd.metadata.file = "Datasets/CassavaPPD/IR/metadata/ir-cassavappd-metadata.csv"

get.metadata.ir.cassavappd(ir.cassppd.data.folder, ir.cassppd.metadata.file)

ir.cassava.ppd = read.dataset.dx(ir.cassppd.data.folder, ir.cassppd.metadata.file, 
                                 type = "ir-spectra", description = "IR for Cassava - PPD",
                                 label.x = "Wavenumber", label.values = "transmittance")

ir.cassava.ppd = convert.to.factor(ir.cassava.ppd, "ppds")

To reduce the number of features and make the analysis algorithms run faster, smoothing interpolation (binning) was applied. The dataset summary before and after binning is shown below:

sum.dataset(ir.cassava.ppd)
## Dataset summary:
## Valid dataset
## Description:  IR for Cassava - PPD 
## Type of data:  ir-spectra 
## Number of samples:  80 
## Number of data points 3735 
## Number of metadata variables:  3 
## Label of x-axis values:  Wavenumber 
## Label of data points:  transmittance 
## Number of missing values in data:  0 
## Mean of data values:  0.5942576 
## Median of data values:  0.61493 
## Standard deviation:  0.1519456 
## Range of values:  0.11203 0.83064 
## Quantiles: 
##      0%     25%     50%     75%    100% 
## 0.11203 0.48967 0.61493 0.72258 0.83064
ir.cassava.ppd = smoothing.interpolation(ir.cassava.ppd, "bin", 10)
## Loading required package: hyperSpec
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mvtnorm
## Package hyperSpec, version 0.98-20140523
## 
## To get started, try
##    vignette ("introduction", package = "hyperSpec")
##    package?hyperSpec 
##    vignette (package = "hyperSpec")
## 
## If you use this package please cite it appropriately.
##    citation("hyperSpec")
## will give you the correct reference.
## 
## The project homepage is http://hyperspec.r-forge.r-project.org
## Warning in spc.bin(hyper.object, reducing.factor, na.rm = TRUE): Last data
## point averages only 5 points.
sum.dataset(ir.cassava.ppd)
## Dataset summary:
## Valid dataset
## Description:  IR for Cassava - PPD-smoothed with hyperSpec spc.bin 
## Type of data:  undefined 
## Number of samples:  80 
## Number of data points 374 
## Number of metadata variables:  3 
## Label of x-axis values:  Wavenumber 
## Label of data points:  transmittance 
## Number of missing values in data:  0 
## Mean of data values:  0.5942012 
## Median of data values:  0.614718 
## Standard deviation:  0.1518141 
## Range of values:  0.113363 0.829931 
## Quantiles: 
##        0%       25%       50%       75%      100% 
## 0.1133630 0.4898193 0.6147180 0.7223875 0.8299310
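As a quick sanity check on the bin size (a minimal base-R sketch; the numbers come from the summaries above): with a reducing factor of 10, the 3735 original points collapse into 374 bins, and the last bin holds only 5 points, which is exactly what the spc.bin warning above reports.

# Feature count after binning: 3735 points with a reducing factor of 10
n.points = 3735
bin.size = 10
ceiling(n.points / bin.size)                                # 374 features after binning
n.points - (ceiling(n.points / bin.size) - 1) * bin.size    # 5 points left for the last bin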

No other preprocessing was applied.

UNIVARIATE ANALYSIS

An analysis of variance (ANOVA) with Tukey's post-hoc test was conducted on the data for the varieties and ppds factors; the top 10 results, ordered by p-value, are shown below:

anova.cassava.varieties = aov.all.vars(ir.cassava.ppd, "varieties")
anova.cassava.varieties[1:10,]
##                       pvalues     logs          fdr
## 3388.4456809052  6.270647e-14 13.20269 4.005011e-12
## 3398.08964180503 6.368315e-14 13.19598 4.005011e-12
## 3407.73360270487 6.391827e-14 13.19437 4.005011e-12
## 3417.37756360471 6.756756e-14 13.17026 4.005011e-12
## 3378.80172000536 6.863320e-14 13.16347 4.005011e-12
## 3427.02152450455 7.623076e-14 13.11787 4.005011e-12
## 3369.15775910552 7.909042e-14 13.10188 4.005011e-12
## 3359.51379820568 9.586882e-14 13.01832 4.005011e-12
## 3436.66548540439 9.637728e-14 13.01603 4.005011e-12
## 3349.86983730584 1.179249e-13 12.92839 4.346994e-12
##                                               tukey
## 3388.4456809052  IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3398.08964180503 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3407.73360270487 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3417.37756360471 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3378.80172000536 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3427.02152450455 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3369.15775910552 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3359.51379820568 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3436.66548540439 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3349.86983730584 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
anova.cassava.ppd = aov.all.vars(ir.cassava.ppd, "ppds")
anova.cassava.ppd[1:10,]
##                       pvalues     logs          fdr         tukey
## 2356.54186462239 2.368101e-06 5.625600 0.0006385371 5-0; 5-3; 8-3
## 2366.18582552223 3.414637e-06 5.466655 0.0006385371 5-0; 5-3; 8-3
## 2337.25394282271 2.264428e-05 4.645042 0.0028229864 3-0; 5-3; 8-3
## 2327.60998192287 9.843466e-05 4.006852 0.0078649116      5-3; 8-3
## 2346.89790372255 1.051459e-04 3.978208 0.0078649116 3-0; 5-3; 8-3
## 2317.96602102303 1.452298e-03 2.837944 0.0905266028      5-3; 8-3
## 2375.82978642207 2.478682e-03 2.605779 0.1277099355      5-3; 8-3
## 967.811495045528 4.003333e-03 2.397578 0.1277099355      8-0; 8-5
## 977.455455945367 4.582083e-03 2.338937 0.1277099355      8-0; 8-5
## 958.167534145688 7.444013e-03 2.128193 0.1277099355      8-0; 8-5
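A minimal follow-up sketch, assuming aov.all.vars() returns the data frame printed above (row names are wavenumbers, with pvalues, logs, fdr and tukey columns): keep only the features that remain significant at a 5% FDR.

# Filter the ANOVA results for the "ppds" factor by FDR-adjusted p-value
sig.ppd = anova.cassava.ppd[anova.cassava.ppd$fdr < 0.05, ]
rownames(sig.ppd)    # wavenumbers whose transmittance differs across PPD times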

CLUSTERING

Hierarchical clustering with Euclidean distance and complete linkage was performed on the data; the resulting dendrograms, coloured by PPD time and by variety, are shown below:

hc.cassava = clustering(ir.cassava.ppd, method = "hc", distance = "euclidean")
dendrogram.plot.col(ir.cassava.ppd, hc.cassava, "ppds")

dendrogram.plot.col(ir.cassava.ppd, hc.cassava, "varieties")

K-means clustering with 4 centers was also performed on the data. The plot below shows, for each cluster, the median of its samples in blue and the values of all samples in that cluster in grey:

kmeans.cassava = clustering(ir.cassava.ppd, method = "kmeans", num.clusters = 4)
kmeans.plot(ir.cassava.ppd, kmeans.cassava)

kmeans.df = kmeans.result.df(kmeans.cassava, 4)
kmeans.df
##   cluster
## 1       1
## 2       2
## 3       3
## 4       4
##                                                                                                                                                                                                   samples
## 1                               IAC5_1 IAC5_2 IAC5_3 IAC5_4 IAC5_5 ORI_1 ORI_2 ORI_3 ORI_4 ORI_5 SAN3_1 SAN3_2 SAN3_3 SAN3_4 SAN3_5 SAN5_1 SAN5_2 SAN5_3 SAN5_4 SAN5_5 SAN8_1 SAN8_2 SAN8_3 SAN8_4 SAN8_5
## 2                                                                                                     BRA_1 BRA_2 BRA3_1 BRA3_2 BRA3_3 BRA3_4 BRA3_5 BRA_3 BRA_4 BRA5_1 BRA5_2 BRA5_3 BRA5_4 BRA5_5 BRA_5
## 3 BRA8_1 BRA8_2 BRA8_3 BRA8_4 BRA8_5 IAC_1 IAC_2 IAC3_1 IAC3_2 IAC3_3 IAC3_4 IAC3_5 IAC_3 IAC_4 IAC_5 IAC8_1 IAC8_2 IAC8_3 IAC8_4 IAC8_5 ORI8_1 ORI8_2 ORI8_3 ORI8_4 ORI8_5 SAN_1 SAN_2 SAN_3 SAN_4 SAN_5
## 4                                                                                                                                   ORI3_1 ORI3_2 ORI3_3 ORI3_4 ORI3_5 ORI5_1 ORI5_2 ORI5_3 ORI5_4 ORI5_5
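A similar hedged sketch for the K-means result, assuming clustering() with method = "kmeans" returns a standard kmeans object with a $cluster component:

# Cross-tabulate K-means cluster assignments with the variety labels
table(kmeans.cassava$cluster, ir.cassava.ppd$metadata$varieties)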

PCA

Principal component analysis (PCA) was performed on the data and some of the resulting plots are shown below:

pca.analysis.result = pca.analysis.dataset(ir.cassava.ppd)

pca.pairs.plot(ir.cassava.ppd, pca.analysis.result, "varieties")

pca.pairs.plot(ir.cassava.ppd, pca.analysis.result, "ppds")

pca.screeplot(pca.analysis.result)

pca.scoresplot2D(ir.cassava.ppd, pca.analysis.result, "varieties", ellipses = T)

pca.scoresplot2D(ir.cassava.ppd, pca.analysis.result, "ppds", ellipses = T)

pca.kmeans.plot2D(ir.cassava.ppd, pca.analysis.result, kmeans.result = kmeans.cassava, ellipses = T)
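A small hedged sketch to complement the scree plot, assuming pca.analysis.dataset() returns a standard prcomp object: the proportion of variance explained by the first components can be read from its summary.

# Proportion of variance explained by the first 5 principal components
summary(pca.analysis.result)$importance[, 1:5]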

MACHINE LEARNING

For classification models and prediction the following parameters were used (a hedged sketch of this validation setup follows the list):

- models: PLS, SVM and Random Forests
- validation method: repeated cross-validation
- number of folds: 10
- number of repeats: 10
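As a hedged illustration (assuming train.models.performance() delegates resampling to the caret package, which the output format below suggests), this validation scheme corresponds to the following caret control object:

library(caret)
# 10-fold cross-validation repeated 10 times, with class probabilities for ROC
ctrl = trainControl(method = "repeatedcv", number = 10, repeats = 10, classProbs = TRUE)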

Below are the results for the best tune of each model:

ml.cassava.varieties = train.models.performance(ir.cassava.ppd, c("pls", "svmLinear", "rf"), "varieties", "repeatedcv", num.folds = 10, num.repeats = 10, tunelength = 20, metric = "ROC")
ml.cassava.varieties$performance
##           Accuracy     Kappa Sensitivity Specificity      ROC AccuracySD
## pls        0.99500 0.9933333     0.99500   0.9983333 1.000000 0.03035980
## svmLinear  0.79375 0.7250000     0.79375   0.9312500 0.986875 0.12228749
## rf         0.98625 0.9816667     0.98625   0.9954167 1.000000 0.03930825
##              KappaSD SensitivitySD SpecificitySD      ROCSD
## pls       0.04047973    0.03035980    0.01011993 0.00000000
## svmLinear 0.16304998    0.12228749    0.04076250 0.02148701
## rf        0.05241101    0.03930825    0.01310275 0.00000000
ml.cassava.ppds = train.models.performance(ir.cassava.ppd, c("pls", "svmLinear", "rf"), "ppds", "repeatedcv", num.folds = 10, num.repeats = 10, tunelength = 20, metric = "ROC")
ml.cassava.ppds$performance
##           Accuracy     Kappa Sensitivity Specificity       ROC AccuracySD
## pls        0.99875 0.9983333     0.99875   0.9995833 1.0000000 0.01250000
## svmLinear  0.98375 0.9783333     0.98375   0.9945833 0.9989583 0.04583333
## rf         0.99875 0.9983333     0.99875   0.9995833 1.0000000 0.01250000
##              KappaSD SensitivitySD SpecificitySD       ROCSD
## pls       0.01666667    0.01250000   0.004166667 0.000000000
## svmLinear 0.06111111    0.04583333   0.015277778 0.005439927
## rf        0.01666667    0.01250000   0.004166667 0.000000000

The variable importance for the varieties and ppds models is shown below:

summary.var.importance(ml.cassava.varieties, 10)
## $pls
##                       BRA       IAC       ORI      SAN     Mean
## 967.811495045528 6.278729 100.00000 32.903702 64.82913 51.00289
## 977.455455945367 6.405656  77.04131 24.493274 43.60135 37.88540
## 418.105723754687 7.752132  85.18005 13.171803 37.48059 35.89615
## 958.167534145688 5.782252  68.13453 27.621381 40.44938 35.49689
## 813.508120648099 3.779121  79.01102 14.393620 43.37156 35.13883
## 1160.69071304231 6.261956  83.51381  7.827909 30.33564 31.98483
## 1517.51726633637 3.100334  65.10566 22.022708 33.69507 30.98094
## 852.083964247456 2.190321  75.72957  7.465535 36.12740 30.37821
## 1594.66895353508 2.426069  58.93469 19.968701 33.93216 28.81541
## 2366.18582552223 7.446325  50.29415 27.576949 29.35479 28.66805
## 
## $svmLinear
##                  BRA IAC      ORI SAN     Mean
## 3494.52925080343 100 100 77.87611 100 94.46903
## 3484.88528990359 100 100 77.87611 100 94.46903
## 3475.24132900375 100 100 77.87611 100 94.46903
## 3465.59736810391 100 100 77.87611 100 94.46903
## 3455.95340720407 100 100 77.87611 100 94.46903
## 3446.30944630423 100 100 77.87611 100 94.46903
## 3436.66548540439 100 100 77.87611 100 94.46903
## 3427.02152450455 100 100 77.87611 100 94.46903
## 3417.37756360471 100 100 77.87611 100 94.46903
## 3407.73360270487 100 100 77.87611 100 94.46903
## 
## $rf
##                    Overall      Mean
## 3292.0060719068  100.00000 100.00000
## 3243.78626740761  90.75347  90.75347
## 3407.73360270487  86.59881  86.59881
## 2694.08049611676  86.03400  86.03400
## 2308.32206012319  85.12578  85.12578
## 1556.09310993573  80.77033  80.77033
## 3224.49834560793  79.95294  79.95294
## 2269.74621652384  79.35050  79.35050
## 3436.66548540439  78.49818  78.49818
## 3272.71815010712  77.83540  77.83540
summary.var.importance(ml.cassava.ppds, 10)
## $pls
##                         0        3        5         8     Mean
## 1565.73707083556 31.36856 44.82214 71.43565 100.00000 61.90659
## 1556.09310993573 32.96379 34.97601 62.13161  97.40827 56.86992
## 1575.3810317354  29.53619 39.13105 64.93856  91.82365 56.35736
## 2356.54186462239 37.95626 67.00608 69.86478  48.79889 55.90650
## 2366.18582552223 36.99927 64.24149 67.33058  48.48759 54.26474
## 2337.25394282271 30.06382 52.17491 55.58851  41.45750 44.82119
## 1585.02499263524 27.02948 27.10084 49.76729  75.07546 44.74327
## 1546.44914903589 28.30017 20.24056 43.48790  75.86620 41.97371
## 2327.60998192287 26.44109 45.44907 49.15567  37.86730 39.72828
## 1720.04044523299 28.87206 23.81292 38.54042  61.65402 38.21986
## 
## $svmLinear
##                        X0        X3        X5       X8     Mean
## 2356.54186462239 80.57554 100.00000 100.00000 94.24460 93.70504
## 2366.18582552223 76.97842  99.28058  99.28058 94.24460 92.44604
## 2337.25394282271 57.55396  97.12230  97.12230 94.24460 86.51079
## 2327.60998192287 51.79856  94.96403  94.96403 94.24460 83.99281
## 2346.89790372255 51.79856  94.24460  94.24460 94.24460 83.63309
## 3754.91619509909 91.36691  92.08633  58.99281 92.08633 83.63309
## 3735.62827329941 92.80576  92.08633  55.39568 92.80576 83.27338
## 3764.56015599893 89.92806  91.36691  57.55396 91.36691 82.55396
## 3822.42392139796 83.45324  87.76978  70.50360 87.76978 82.37410
## 3803.13599959829 86.33094  88.48921  66.18705 88.48921 82.37410
## 
## $rf
##                    Overall      Mean
## 3388.4456809052  100.00000 100.00000
## 2356.54186462239  98.18492  98.18492
## 3292.0060719068   95.27550  95.27550
## 987.099416845206  95.05345  95.05345
## 524.189293652919  92.83034  92.83034
## 3639.18866430102  90.71171  90.71171
## 1122.11486944296  89.65394  89.65394
## 967.811495045528  89.31905  89.31905
## 3369.15775910552  89.13645  89.13645
## 456.681567354044  88.62628  88.62628
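A hedged sketch on top of these results (assuming summary.var.importance() returns the list of data frames printed above, with wavenumbers as row names): the wavenumbers important to both the PLS and the random forest models for the ppds factor can be obtained by intersecting their top-10 rankings.

# Wavenumbers appearing in the top 10 of both the PLS and RF importance rankings
vi.ppds = summary.var.importance(ml.cassava.ppds, 10)
intersect(rownames(vi.ppds$pls), rownames(vi.ppds$rf))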

FEATURE SELECTION

Using recursive feature selection, various subset sizes were evaluated with a random forest classifier. The results are shown below:

feature.selection.result.varieties = feature.selection(ir.cassava.ppd, "varieties", method="rfe", functions = rfFuncs, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result.varieties
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          2   0.9450 0.9267    0.08806 0.11742         
##          4   0.9625 0.9500    0.07254 0.09671         
##          8   0.9800 0.9733    0.05847 0.07796         
##         16   0.9725 0.9633    0.06334 0.08445         
##         32   0.9725 0.9633    0.06334 0.08445         
##         64   0.9825 0.9767    0.05653 0.07537        *
##        374   0.9750 0.9667    0.06186 0.08248         
## 
## The top 5 variables (out of 64):
##    X1064.25110404392, X1633.24479713444, X3214.85438470809, X1642.88875803428, X1604.31291443492
plot(feature.selection.result.varieties, type=c("g","o"))

feature.selection.result.ppds = feature.selection(ir.cassava.ppd, "ppds", method="rfe", functions = rfFuncs, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result.ppds
## 
## Recursive feature selection
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance over subset size:
## 
##  Variables Accuracy  Kappa AccuracySD KappaSD Selected
##          2   0.7225 0.6300    0.22058 0.29410         
##          4   0.8625 0.8167    0.15619 0.20825         
##          8   0.9575 0.9433    0.06967 0.09289         
##         16   0.9625 0.9500    0.07681 0.10241         
##         32   0.9700 0.9600    0.06944 0.09258         
##         64   0.9850 0.9800    0.05997 0.07997        *
##        374   0.9850 0.9800    0.05997 0.07997         
## 
## The top 5 variables (out of 64):
##    X2366.18582552223, X2356.54186462239, X2337.25394282271, X2327.60998192287, X2346.89790372255
plot(feature.selection.result.ppds, type=c("g","o"))
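A hedged follow-up sketch, assuming feature.selection() with method = "rfe" returns a caret rfe object (the printouts above match caret's): predictors() lists the variables retained by the final model.

library(caret)
# Wavenumbers selected by the RFE procedure for the "ppds" factor
head(predictors(feature.selection.result.ppds))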

Selection by filter was also used, with the results shown below:

feature.selection.result2.varieties = feature.selection(ir.cassava.ppd, "varieties", method="filter", functions = rfSBF, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result2.varieties
## 
## Selection By Filter
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance:
## 
##  Accuracy  Kappa AccuracySD KappaSD
##    0.9725 0.9633    0.05808 0.07745
## 
## Using the training set, 374 variables were selected:
##    X3996.01521759507, X3986.37125669523, X3976.72729579539, X3967.08333489555, X3957.43937399571...
## 
## During resampling, the top 5 selected variables (out of a possible 374):
##    X1006.38733864488 (100%), X1016.03129954472 (100%), X1025.67526044456 (100%), X1035.3192213444 (100%), X1044.96318224424 (100%)
## 
## On average, 374 variables were selected (min = 374, max = 374)
feature.selection.result2.ppds = feature.selection(ir.cassava.ppd, "ppds", method="filter", functions = rfSBF, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result2.ppds
## 
## Selection By Filter
## 
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times) 
## 
## Resampling performance:
## 
##  Accuracy  Kappa AccuracySD KappaSD
##      0.98 0.9733    0.04629 0.06172
## 
## Using the training set, 143 variables were selected:
##    X3899.57560859668, X3870.64372589716, X3860.99976499732, X3851.35580409748, X3841.71184319764...
## 
## During resampling, the top 5 selected variables (out of a possible 309):
##    X2317.96602102303 (100%), X2327.60998192287 (100%), X2337.25394282271 (100%), X2346.89790372255 (100%), X2356.54186462239 (100%)
## 
## On average, 102.6 variables were selected (min = 13, max = 296)
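Assuming the filter results are caret sbf objects (again hedged on the printouts above), the same predictors() accessor lists the variables kept when fitting on the full training set:

# Variables retained by the filter-based selection for the "ppds" factor
head(predictors(feature.selection.result2.ppds))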