# Switch to the directory where the input files are stored
# (adjust this path to your own setup).
setwd("~/tmpBio/")

# Read the two input tables: the bracelet analysis results and the
# questionnaire answers. Both files are tab-separated with a header line,
# and the first column of each file is used as the row names.
PAHCompounds <- read.table(
  file = "./PAHCompounds",
  header = TRUE,
  sep = "\t",
  row.names = 1
)
SamplesFeatures <- read.table(
  file = "./SamplesFeatures",
  header = TRUE,
  sep = "\t",
  row.names = 1
)

# Print both tables for a quick visual check.
SamplesFeatures
PAHCompounds

# Rows can be selected (and reordered) either by position or by row name.
SamplesFeatures[c(2, 1), ]
SamplesFeatures[c("WB2", "WB1"), ]

# Every sample in PAHCompounds must have a row in SamplesFeatures: indexing a
# data frame by an unknown row name does not fail, it silently returns a row
# filled with NA, which would corrupt any later per-sample comparison.
if (!all(rownames(PAHCompounds) %in% rownames(SamplesFeatures))) {
  stop(
    "Samples present in PAHCompounds but missing from SamplesFeatures: ",
    paste(
      setdiff(rownames(PAHCompounds), rownames(SamplesFeatures)),
      collapse = ", "
    ),
    call. = FALSE
  )
}

# Put the rows of SamplesFeatures in the same order as those of PAHCompounds.
# drop = FALSE keeps the result a data frame even if there is only one column.
SamplesFeaturesSort <- SamplesFeatures[rownames(PAHCompounds), , drop = FALSE]

SamplesFeaturesSort

# Inspect the data: per-column summary of the sample features.
sumsamples <- summary(SamplesFeatures)
sumsamples

# Exercise: do the same for PAHCompounds.

# Pie chart of the Sex column. Each slice is labelled with the category name
# and, on a second line, the number of samples in that category.
mytable <- table(SamplesFeatures$Sex)
lbls <- paste0(names(mytable), "\n", mytable)
pie(
  mytable,
  labels = lbls,
  main = "Pie Chart of Sex distribution \n (with sample sizes)"
)

# Exercise: do the same for the other columns.

# Visualize the data: pull out the four compound columns and draw one
# box per compound, side by side.
Fluorene <- PAHCompounds$Fluorene
Phenantrene <- PAHCompounds$Phenantrene
Fluoranthene <- PAHCompounds$Fluoranthene
Pyrene <- PAHCompounds$Pyrene

boxplot(
  Fluorene, Phenantrene, Fluoranthene, Pyrene,
  main = "PAH compounds",
  names = c("Fluorene", "Phenantrene", "Fluoranthene", "Pyrene"),
  col = c("orange", "green", "blue", "red")
)


# Tests of the data distribution (normality checks on Fluorene)

# Kolmogorov-Smirnov test: compares the Fluorene values with a normal
# distribution whose mean and standard deviation are taken from the sample.
# NOTE(review): ks.test() expects the parameters of the reference distribution
# to be specified in advance; estimating them from the same data makes the
# reported p-value unreliable (too conservative). The Lilliefors variant,
# nortest::lillie.test(), corrects for this - consider using it instead.
ks.test(
  PAHCompounds$Fluorene, "pnorm",
  mean = mean(PAHCompounds$Fluorene),
  sd = sd(PAHCompounds$Fluorene)
)

# D is the value of the K-S test statistic: the maximum absolute difference
# between the empirical cumulative distribution function of the sample and the
# cumulative distribution function of the reference (normal) distribution.

# The p-value is the number to look at. The smaller it is, the less compatible
# the data are with the null hypothesis that the sample comes from the
# reference distribution. (More technically, it is the probability of
# observing a statistic at least as extreme as the one in your data, assuming
# the null hypothesis is true.)

# alternative hypothesis = two-sided. The alternative hypothesis is denoted
# H1. Two-sided means the sample distribution differs from the reference
# distribution in either direction; a one-sided alternative would state that
# it lies entirely above (or entirely below) the reference.


# Anderson-Darling test (from the nortest package).
# Install the package only when it is missing, instead of re-installing it
# every time the script is run.
if (!requireNamespace("nortest", quietly = TRUE)) {
  install.packages("nortest")
}
library(nortest)
ad.test(PAHCompounds$Fluorene)
# A is the value of the Anderson-Darling statistic.

# Both tests below compare Fluorene between the Sex groups. The grouping
# variable is taken from SamplesFeaturesSort (the sample features reordered to
# match the rows of PAHCompounds), so that every measurement is paired with
# the sex of the same sample. Using the unsorted SamplesFeatures would pair
# values by row position only and give wrong groups whenever the two tables
# are not already in the same order.

# Kruskal-Wallis test: checks for a significant difference without assuming
# normally distributed data.
kw_fluo <- kruskal.test(PAHCompounds$Fluorene ~ SamplesFeaturesSort$Sex)
kw_fluo

# H0: the Fluorene distributions are identical in the populations.


# ANOVA: checks for a significant difference assuming normally distributed
# data.
aov_fluo <- aov(PAHCompounds$Fluorene ~ SamplesFeaturesSort$Sex)
summary(aov_fluo)


# Heat map of the PAH measurements; heatmap() needs a numeric matrix, so the
# data frame is converted first.
heatmap(x = as.matrix(PAHCompounds))