The data are read from .dx files and converted to a dataset list. The variable "ppds", which is numeric, was also converted to a factor.
# Working directory containing the project data.
# NOTE(review): setwd() in a script is fragile -- consider relative
# paths or the here package when this report is re-run elsewhere.
setwd("~/Dropbox")
library(metabolomicsUM)
source("Datasets/CassavaPPD/IR/scripts/ir-cassavappd-metadata.R")

# Locations of the raw .dx spectra folder and the metadata CSV file.
ir.cassppd.data.folder <- "Datasets/CassavaPPD/IR/data"
ir.cassppd.metadata.file <- "Datasets/CassavaPPD/IR/metadata/ir-cassavappd-metadata.csv"

# Generate the metadata file from the data folder, then read the IR
# spectra into a dataset list.
get.metadata.ir.cassavappd(ir.cassppd.data.folder, ir.cassppd.metadata.file)
ir.cassava.ppd <- read.dataset.dx(
  ir.cassppd.data.folder,
  ir.cassppd.metadata.file,
  type = "ir-spectra",
  description = "IR for Cassava - PPD",
  label.x = "Wavenumber",
  label.values = "transmittance"
)

# "ppds" is stored as a number in the metadata; convert it to a factor
# so the analyses below treat it as a categorical variable.
ir.cassava.ppd <- convert.to.factor(ir.cassava.ppd, "ppds")
To reduce the number of features, so that the analysis algorithms run faster, smoothing interpolation was used.
# Summary of the raw dataset: 80 samples x 3735 IR data points, no
# missing values. Output below is the echoed result of this call.
sum.dataset(ir.cassava.ppd)
## Dataset summary:
## Valid dataset
## Description: IR for Cassava - PPD
## Type of data: ir-spectra
## Number of samples: 80
## Number of data points 3735
## Number of metadata variables: 3
## Label of x-axis values: Wavenumber
## Label of data points: transmittance
## Number of missing values in data: 0
## Mean of data values: 0.5942576
## Median of data values: 0.61493
## Standard deviation: 0.1519456
## Range of values: 0.11203 0.83064
## Quantiles:
## 0% 25% 50% 75% 100%
## 0.11203 0.48967 0.61493 0.72258 0.83064
# Bin-based smoothing with a reducing factor of 10: consecutive points
# are averaged in bins of 10 (via hyperSpec's spc.bin, loaded on first
# use below), shrinking 3735 features to 374. The warning notes that
# the last bin only contains 5 points (3735 is not divisible by 10).
ir.cassava.ppd = smoothing.interpolation(ir.cassava.ppd, "bin", 10)
## Loading required package: hyperSpec
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mvtnorm
## Package hyperSpec, version 0.98-20140523
##
## To get started, try
## vignette ("introduction", package = "hyperSpec")
## package?hyperSpec
## vignette (package = "hyperSpec")
##
## If you use this package please cite it appropriately.
## citation("hyperSpec")
## will give you the correct reference.
##
## The project homepage is http://hyperspec.r-forge.r-project.org
## Warning in spc.bin(hyper.object, reducing.factor, na.rm = TRUE): Last data
## point averages only 5 points.
# Summary after smoothing: 374 data points (down from 3735); summary
# statistics are essentially unchanged, confirming binning preserved
# the overall signal. (Type of data now reads "undefined" -- a side
# effect of the smoothing step, not a problem with the data.)
sum.dataset(ir.cassava.ppd)
## Dataset summary:
## Valid dataset
## Description: IR for Cassava - PPD-smoothed with hyperSpec spc.bin
## Type of data: undefined
## Number of samples: 80
## Number of data points 374
## Number of metadata variables: 3
## Label of x-axis values: Wavenumber
## Label of data points: transmittance
## Number of missing values in data: 0
## Mean of data values: 0.5942012
## Median of data values: 0.614718
## Standard deviation: 0.1518141
## Range of values: 0.113363 0.829931
## Quantiles:
## 0% 25% 50% 75% 100%
## 0.1133630 0.4898193 0.6147180 0.7223875 0.8299310
No preprocessing used.
UNIVARIATE ANALYSIS
An analysis of variance (ANOVA) was conducted on the data, together with Tukey's post-hoc test; the top 10 results, ordered by p-value, are shown below:
# One-way ANOVA of every feature against the "varieties" metadata
# variable; rows are wavenumbers, ordered by p-value, with FDR-adjusted
# values and the Tukey pairs found significant.
anova.cassava.varieties = aov.all.vars(ir.cassava.ppd, "varieties")
anova.cassava.varieties[1:10,]
## pvalues logs fdr
## 3388.4456809052 6.270647e-14 13.20269 4.005011e-12
## 3398.08964180503 6.368315e-14 13.19598 4.005011e-12
## 3407.73360270487 6.391827e-14 13.19437 4.005011e-12
## 3417.37756360471 6.756756e-14 13.17026 4.005011e-12
## 3378.80172000536 6.863320e-14 13.16347 4.005011e-12
## 3427.02152450455 7.623076e-14 13.11787 4.005011e-12
## 3369.15775910552 7.909042e-14 13.10188 4.005011e-12
## 3359.51379820568 9.586882e-14 13.01832 4.005011e-12
## 3436.66548540439 9.637728e-14 13.01603 4.005011e-12
## 3349.86983730584 1.179249e-13 12.92839 4.346994e-12
## tukey
## 3388.4456809052 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3398.08964180503 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3407.73360270487 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3417.37756360471 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3378.80172000536 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3427.02152450455 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3369.15775910552 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3359.51379820568 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3436.66548540439 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
## 3349.86983730584 IAC-BRA; SAN-BRA; ORI-IAC; SAN-ORI
# Same ANOVA + Tukey analysis against the "ppds" factor (PPD time
# points); the strongest differences concentrate around 2300-2380 cm-1.
anova.cassava.ppd = aov.all.vars(ir.cassava.ppd, "ppds")
anova.cassava.ppd[1:10,]
## pvalues logs fdr tukey
## 2356.54186462239 2.368101e-06 5.625600 0.0006385371 5-0; 5-3; 8-3
## 2366.18582552223 3.414637e-06 5.466655 0.0006385371 5-0; 5-3; 8-3
## 2337.25394282271 2.264428e-05 4.645042 0.0028229864 3-0; 5-3; 8-3
## 2327.60998192287 9.843466e-05 4.006852 0.0078649116 5-3; 8-3
## 2346.89790372255 1.051459e-04 3.978208 0.0078649116 3-0; 5-3; 8-3
## 2317.96602102303 1.452298e-03 2.837944 0.0905266028 5-3; 8-3
## 2375.82978642207 2.478682e-03 2.605779 0.1277099355 5-3; 8-3
## 967.811495045528 4.003333e-03 2.397578 0.1277099355 8-0; 8-5
## 977.455455945367 4.582083e-03 2.338937 0.1277099355 8-0; 8-5
## 958.167534145688 7.444013e-03 2.128193 0.1277099355 8-0; 8-5
CLUSTERING
Hierarchical clustering with Euclidean distance and the complete linkage method was performed on the data; the resulting dendrogram is shown below:
# Hierarchical clustering (Euclidean distance; the project's default
# agglomeration method) with dendrograms coloured by each metadata
# variable.
cassava.hclust <- clustering(ir.cassava.ppd,
                             method = "hc",
                             distance = "euclidean")
dendrogram.plot.col(ir.cassava.ppd, cassava.hclust, "ppds")
dendrogram.plot.col(ir.cassava.ppd, cassava.hclust, "varieties")
K-means was also performed on the data, with 4 centers. The results are shown below, together with a plot giving, for each cluster, the median of its samples in blue and the values of all samples in that cluster in grey:
# K-means with 4 centers (kmeans.cassava is reused later by the PCA
# overlay plot). The cluster/sample table shows that cluster membership
# tracks variety more than PPD time point.
kmeans.cassava = clustering(ir.cassava.ppd, method = "kmeans", num.clusters = 4)
kmeans.plot(ir.cassava.ppd, kmeans.cassava)
kmeans.df = kmeans.result.df(kmeans.cassava, 4)
kmeans.df
## cluster
## 1 1
## 2 2
## 3 3
## 4 4
## samples
## 1 IAC5_1 IAC5_2 IAC5_3 IAC5_4 IAC5_5 ORI_1 ORI_2 ORI_3 ORI_4 ORI_5 SAN3_1 SAN3_2 SAN3_3 SAN3_4 SAN3_5 SAN5_1 SAN5_2 SAN5_3 SAN5_4 SAN5_5 SAN8_1 SAN8_2 SAN8_3 SAN8_4 SAN8_5
## 2 BRA_1 BRA_2 BRA3_1 BRA3_2 BRA3_3 BRA3_4 BRA3_5 BRA_3 BRA_4 BRA5_1 BRA5_2 BRA5_3 BRA5_4 BRA5_5 BRA_5
## 3 BRA8_1 BRA8_2 BRA8_3 BRA8_4 BRA8_5 IAC_1 IAC_2 IAC3_1 IAC3_2 IAC3_3 IAC3_4 IAC3_5 IAC_3 IAC_4 IAC_5 IAC8_1 IAC8_2 IAC8_3 IAC8_4 IAC8_5 ORI8_1 ORI8_2 ORI8_3 ORI8_4 ORI8_5 SAN_1 SAN_2 SAN_3 SAN_4 SAN_5
## 4 ORI3_1 ORI3_2 ORI3_3 ORI3_4 ORI3_5 ORI5_1 ORI5_2 ORI5_3 ORI5_4 ORI5_5
PCA
Principal components analysis was performed on the data and some plots are shown below:
# Principal component analysis of the smoothed dataset, with pairs,
# scree and 2D scores plots coloured by each metadata variable, and a
# final scores plot overlaid with the k-means clusters from above.
pca.analysis.result <- pca.analysis.dataset(ir.cassava.ppd)
pca.pairs.plot(ir.cassava.ppd, pca.analysis.result, "varieties")
pca.pairs.plot(ir.cassava.ppd, pca.analysis.result, "ppds")
pca.screeplot(pca.analysis.result)
# Use TRUE rather than T: T is an ordinary variable and can be
# reassigned, whereas TRUE is a reserved constant.
pca.scoresplot2D(ir.cassava.ppd, pca.analysis.result, "varieties", ellipses = TRUE)
pca.scoresplot2D(ir.cassava.ppd, pca.analysis.result, "ppds", ellipses = TRUE)
pca.kmeans.plot2D(ir.cassava.ppd, pca.analysis.result, kmeans.result = kmeans.cassava, ellipses = TRUE)
MACHINE LEARNING
For classification models and prediction, the following parameters were used -- models: PLS, SVM and random forests; validation method: repeated cross-validation; number of folds: 10; number of repeats: 10.
Below are some results with the best tune for each model:
# Train PLS, linear SVM and random forest classifiers to predict
# "varieties" with 10x10 repeated CV.
# NOTE(review): the argument is spelled "tunelength" here (caret's own
# is "tuneLength") and metric = "ROC" is requested for a 4-class
# problem -- confirm the wrapper accepts/handles both as intended.
ml.cassava.varieties = train.models.performance(ir.cassava.ppd, c("pls", "svmLinear", "rf"), "varieties", "repeatedcv", num.folds = 10, num.repeats = 10, tunelength = 20, metric = "ROC")
# PLS and RF are near-perfect; linear SVM lags on this task.
ml.cassava.varieties$performance
## Accuracy Kappa Sensitivity Specificity ROC AccuracySD
## pls 0.99500 0.9933333 0.99500 0.9983333 1.000000 0.03035980
## svmLinear 0.79375 0.7250000 0.79375 0.9312500 0.986875 0.12228749
## rf 0.98625 0.9816667 0.98625 0.9954167 1.000000 0.03930825
## KappaSD SensitivitySD SpecificitySD ROCSD
## pls 0.04047973 0.03035980 0.01011993 0.00000000
## svmLinear 0.16304998 0.12228749 0.04076250 0.02148701
## rf 0.05241101 0.03930825 0.01310275 0.00000000
# Same model comparison for the "ppds" factor; all three models perform
# near-perfectly here.
ml.cassava.ppds = train.models.performance(ir.cassava.ppd, c("pls",
"svmLinear", "rf"), "ppds", "repeatedcv", num.folds = 10, num.repeats = 10, tunelength = 20, metric = "ROC")
ml.cassava.ppds$performance
## Accuracy Kappa Sensitivity Specificity ROC AccuracySD
## pls 0.99875 0.9983333 0.99875 0.9995833 1.0000000 0.01250000
## svmLinear 0.98375 0.9783333 0.98375 0.9945833 0.9989583 0.04583333
## rf 0.99875 0.9983333 0.99875 0.9995833 1.0000000 0.01250000
## KappaSD SensitivitySD SpecificitySD ROCSD
## pls 0.01666667 0.01250000 0.004166667 0.000000000
## svmLinear 0.06111111 0.04583333 0.015277778 0.005439927
## rf 0.01666667 0.01250000 0.004166667 0.000000000
And the variable importance in varieties and ppds models:
# Top-10 variable importance per model for the "varieties" classifiers;
# rows are wavenumbers, columns are per-class importances (PLS/SVM) or
# a single Overall score (RF).
summary.var.importance(ml.cassava.varieties, 10)
## $pls
## BRA IAC ORI SAN Mean
## 967.811495045528 6.278729 100.00000 32.903702 64.82913 51.00289
## 977.455455945367 6.405656 77.04131 24.493274 43.60135 37.88540
## 418.105723754687 7.752132 85.18005 13.171803 37.48059 35.89615
## 958.167534145688 5.782252 68.13453 27.621381 40.44938 35.49689
## 813.508120648099 3.779121 79.01102 14.393620 43.37156 35.13883
## 1160.69071304231 6.261956 83.51381 7.827909 30.33564 31.98483
## 1517.51726633637 3.100334 65.10566 22.022708 33.69507 30.98094
## 852.083964247456 2.190321 75.72957 7.465535 36.12740 30.37821
## 1594.66895353508 2.426069 58.93469 19.968701 33.93216 28.81541
## 2366.18582552223 7.446325 50.29415 27.576949 29.35479 28.66805
##
## $svmLinear
## BRA IAC ORI SAN Mean
## 3494.52925080343 100 100 77.87611 100 94.46903
## 3484.88528990359 100 100 77.87611 100 94.46903
## 3475.24132900375 100 100 77.87611 100 94.46903
## 3465.59736810391 100 100 77.87611 100 94.46903
## 3455.95340720407 100 100 77.87611 100 94.46903
## 3446.30944630423 100 100 77.87611 100 94.46903
## 3436.66548540439 100 100 77.87611 100 94.46903
## 3427.02152450455 100 100 77.87611 100 94.46903
## 3417.37756360471 100 100 77.87611 100 94.46903
## 3407.73360270487 100 100 77.87611 100 94.46903
##
## $rf
## Overall Mean
## 3292.0060719068 100.00000 100.00000
## 3243.78626740761 90.75347 90.75347
## 3407.73360270487 86.59881 86.59881
## 2694.08049611676 86.03400 86.03400
## 2308.32206012319 85.12578 85.12578
## 1556.09310993573 80.77033 80.77033
## 3224.49834560793 79.95294 79.95294
## 2269.74621652384 79.35050 79.35050
## 3436.66548540439 78.49818 78.49818
## 3272.71815010712 77.83540 77.83540
# Top-10 variable importance per model for the "ppds" classifiers; the
# 2300-2380 cm-1 region dominates, consistent with the ANOVA results.
summary.var.importance(ml.cassava.ppds, 10)
## $pls
## 0 3 5 8 Mean
## 1565.73707083556 31.36856 44.82214 71.43565 100.00000 61.90659
## 1556.09310993573 32.96379 34.97601 62.13161 97.40827 56.86992
## 1575.3810317354 29.53619 39.13105 64.93856 91.82365 56.35736
## 2356.54186462239 37.95626 67.00608 69.86478 48.79889 55.90650
## 2366.18582552223 36.99927 64.24149 67.33058 48.48759 54.26474
## 2337.25394282271 30.06382 52.17491 55.58851 41.45750 44.82119
## 1585.02499263524 27.02948 27.10084 49.76729 75.07546 44.74327
## 1546.44914903589 28.30017 20.24056 43.48790 75.86620 41.97371
## 2327.60998192287 26.44109 45.44907 49.15567 37.86730 39.72828
## 1720.04044523299 28.87206 23.81292 38.54042 61.65402 38.21986
##
## $svmLinear
## X0 X3 X5 X8 Mean
## 2356.54186462239 80.57554 100.00000 100.00000 94.24460 93.70504
## 2366.18582552223 76.97842 99.28058 99.28058 94.24460 92.44604
## 2337.25394282271 57.55396 97.12230 97.12230 94.24460 86.51079
## 2327.60998192287 51.79856 94.96403 94.96403 94.24460 83.99281
## 2346.89790372255 51.79856 94.24460 94.24460 94.24460 83.63309
## 3754.91619509909 91.36691 92.08633 58.99281 92.08633 83.63309
## 3735.62827329941 92.80576 92.08633 55.39568 92.80576 83.27338
## 3764.56015599893 89.92806 91.36691 57.55396 91.36691 82.55396
## 3822.42392139796 83.45324 87.76978 70.50360 87.76978 82.37410
## 3803.13599959829 86.33094 88.48921 66.18705 88.48921 82.37410
##
## $rf
## Overall Mean
## 3388.4456809052 100.00000 100.00000
## 2356.54186462239 98.18492 98.18492
## 3292.0060719068 95.27550 95.27550
## 987.099416845206 95.05345 95.05345
## 524.189293652919 92.83034 92.83034
## 3639.18866430102 90.71171 90.71171
## 1122.11486944296 89.65394 89.65394
## 967.811495045528 89.31905 89.31905
## 3369.15775910552 89.13645 89.13645
## 456.681567354044 88.62628 88.62628
FEATURE SELECTION
Using recursive feature elimination, various subset sizes were evaluated with a random forests classifier. The results are shown below:
# Recursive feature elimination (caret-style rfFuncs, i.e. random
# forest) for "varieties", testing subset sizes 2,4,...,64 with 10-fold
# CV repeated 5 times; the 64-variable subset is selected (*).
feature.selection.result.varieties = feature.selection(ir.cassava.ppd, "varieties", method="rfe", functions = rfFuncs, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result.varieties
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.9450 0.9267 0.08806 0.11742
## 4 0.9625 0.9500 0.07254 0.09671
## 8 0.9800 0.9733 0.05847 0.07796
## 16 0.9725 0.9633 0.06334 0.08445
## 32 0.9725 0.9633 0.06334 0.08445
## 64 0.9825 0.9767 0.05653 0.07537 *
## 374 0.9750 0.9667 0.06186 0.08248
##
## The top 5 variables (out of 64):
## X1064.25110404392, X1633.24479713444, X3214.85438470809, X1642.88875803428, X1604.31291443492
# Accuracy vs subset size.
plot(feature.selection.result.varieties, type=c("g","o"))
# Same RFE procedure for "ppds"; 64 variables selected, again led by the
# 2300-2380 cm-1 wavenumbers.
feature.selection.result.ppds = feature.selection(ir.cassava.ppd, "ppds", method="rfe", functions = rfFuncs, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result.ppds
##
## Recursive feature selection
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance over subset size:
##
## Variables Accuracy Kappa AccuracySD KappaSD Selected
## 2 0.7225 0.6300 0.22058 0.29410
## 4 0.8625 0.8167 0.15619 0.20825
## 8 0.9575 0.9433 0.06967 0.09289
## 16 0.9625 0.9500 0.07681 0.10241
## 32 0.9700 0.9600 0.06944 0.09258
## 64 0.9850 0.9800 0.05997 0.07997 *
## 374 0.9850 0.9800 0.05997 0.07997
##
## The top 5 variables (out of 64):
## X2366.18582552223, X2356.54186462239, X2337.25394282271, X2327.60998192287, X2346.89790372255
# Accuracy vs subset size.
plot(feature.selection.result.ppds, type=c("g","o"))
Selection by filter was also applied, with the results shown below:
# Selection by filter (caret-style rfSBF: univariate filter + random
# forest) for "varieties". The filter keeps all 374 variables here, so
# it does not discriminate for this response.
feature.selection.result2.varieties = feature.selection(ir.cassava.ppd, "varieties", method="filter", functions = rfSBF, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result2.varieties
##
## Selection By Filter
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance:
##
## Accuracy Kappa AccuracySD KappaSD
## 0.9725 0.9633 0.05808 0.07745
##
## Using the training set, 374 variables were selected:
## X3996.01521759507, X3986.37125669523, X3976.72729579539, X3967.08333489555, X3957.43937399571...
##
## During resampling, the top 5 selected variables (out of a possible 374):
## X1006.38733864488 (100%), X1016.03129954472 (100%), X1025.67526044456 (100%), X1035.3192213444 (100%), X1044.96318224424 (100%)
##
## On average, 374 variables were selected (min = 374, max = 374)
# Selection by filter for "ppds": here the filter is informative,
# keeping 143 variables on the full training set (102.6 on average
# during resampling), with the 2300-2360 cm-1 region always retained.
feature.selection.result2.ppds = feature.selection(ir.cassava.ppd, "ppds", method="filter", functions = rfSBF, validation = "repeatedcv", repeats = 5, subsets = 2^(1:6))
feature.selection.result2.ppds
##
## Selection By Filter
##
## Outer resampling method: Cross-Validated (10 fold, repeated 5 times)
##
## Resampling performance:
##
## Accuracy Kappa AccuracySD KappaSD
## 0.98 0.9733 0.04629 0.06172
##
## Using the training set, 143 variables were selected:
## X3899.57560859668, X3870.64372589716, X3860.99976499732, X3851.35580409748, X3841.71184319764...
##
## During resampling, the top 5 selected variables (out of a possible 309):
## X2317.96602102303 (100%), X2327.60998192287 (100%), X2337.25394282271 (100%), X2346.89790372255 (100%), X2356.54186462239 (100%)
##
## On average, 102.6 variables were selected (min = 13, max = 296)