Basic Statistics.ppt_第1页
Basic Statistics.ppt_第2页
Basic Statistics.ppt_第3页
Basic Statistics.ppt_第4页
Basic Statistics.ppt_第5页
已阅读5页,还剩84页未读 继续免费阅读

下载本文档

版权说明:本文档由用户提供并上传,收益归属内容提供方,若内容存在侵权,请进行举报或认领

文档简介

BasicStatistics,Content,DescriptiveStatisticsFrequenciesy-x#createsomedatapar(pch=22,col=red)#plottingsymbolandcolorpar(mfrow=c(2,4)#allplotsononepageopts=c(p,l,o,b,c,s,S,h)for(iin1:length(opts)heading=paste(type=,optsi)plot(x,y,type=n,main=heading)lines(x,y,type=optsi),Next,wedemonstrateeachofthetype=optionswhenplot()setsupthegraphanddoesplotthepoints.x-c(1:5);y-x#createsomedatapar(pch=22,col=blue)#plottingsymbolandcolorpar(mfrow=c(2,4)#allplotsononepageopts=c(p,l,o,b,c,s,S,h)for(iin1:length(opts)heading=paste(type=,optsi)plot(x,y,main=heading)lines(x,y,type=optsi)Asyoucansee,thetype=coptiononlylooksdifferentfromthetype=boptioniftheplottingofpointsissuppressedintheplot()command.,Todemonstratethecreationofamorecomplexlinechart,letsplotthegrowthof5orangetreesovertime.Eachtreewillhaveitsowndistinctiveline.ThedatacomefromthedatasetOrange.#CreateLineChart#convertfactortonumericforconvenienceOrange$Tree-as.numeric(Orange$Tree)ntrees-max(Orange$Tree)#gettherangeforthexandyaxisxrange-range(Orange$age)yrange-range(Orange$circumference)#setuptheplotplot(xrange,yrange,type=n,xlab=Age(days),ylab=Circumference(mm)colors-rainbow(ntrees)linetype-c(1:ntrees)plotchar-seq(18,18+ntrees,1),#addlinesfor(iin1:ntrees)tree-subset(Orange,Tree=i)lines(tree$age,tree$circumference,type=b,lwd=1.5,lty=linetypei,col=colorsi,pch=plotchari)#addatitleandsubtitletitle(TreeGrowth,exampleoflineplot)#addalegendlegend(xrange1,yrange2,1:ntrees,cex=0.8,col=colors,pch=plotchar,lty=linetype,title=Tree),PieCharts,PiechartsarenotrecommendedintheRdocumentation,andtheirfeaturesaresomewhatlimited.Theauthorsrecommendbarordotplotsoverpiechartsbecausepeopleareabletojudgelengthmoreaccuratelythanvolume.Piechartsarecreatedwiththefunctionpie(x,labels=)wherexisanon-negativenumericvectorindicatingtheareaofeachsliceandlabels=notesacharactervectorofnamesfortheslices.,SimplePieChart#SimplePieChartslices-c(10,12,4,16,8)lbls-c(US,UK,Australia,Germany,France)pie(slices,labels=lbls,main=PieChartofCountries)PieChartwithAnnotatedPercentages#PieChartwithPercentagesslices-c(10,12,4,16,8)lbls-c(US,UK,Australia,Germany,France)pct-round(slices/sum(slices)*100)lbls-paste(lbls,pct)#addpercentstolabelslbls-paste(lbls,%,sep=)#ad%tolabelspie(slices,labels=lbls,col=rainbow(length(lbls),main=PieChartofCountries),3DPieChartThepie3D()functionintheplotrixpackageprovides3Dexplodedpiecharts.#3DExplodedPieChartlibrary(plotrix)slices-c(10,12,4,16,8)lbls-c(US,UK,Australia,Germany,France)pie3D(slices,labels=lbls,explode=0.1,main=PieChartofCountries)CreatingAnnotatedPiesfromadataframe#PieChartfromdataframewithAppendedSampleSizesmytable-table(iris$Species)lbls-paste(names(mytable),n,mytable,sep=)pie(mytable,labels=lbls,main=PieChartofSpeciesn(withsamplesizes),Boxplots,Boxplotscanbecreatedforindividualvariablesorforvariablesbygroup.Theformatisboxplot(x,data=),wherexisaformulaanddata=denotesthedataframeprovidingthedata.Anexampleofaformulaisygroupwhereaseparateboxplotfornumericvariableyisgeneratedforeachvalueofgroup.Addvarwidth=TRUEtomakeboxplotwidthsproportionaltothesquarerootofthesamplessizes.Addhorizontal=TRUEtoreversetheaxisorientation.#BoxplotofMPGbyCarCylindersboxplot(mpgcyl,data=mtcars,main=CarMilageData,xlab=NumberofCylinders,ylab=MilesPerGallon),#NotchedBoxplotofToothGrowthAgainst2CrossedFactors#boxescoloredforeaseofinterpretationboxplot(lensupp*dose,data=ToothGrowth,notch=TRUE,col=(c(gold,darkgreen),main=ToothGrowth,xlab=SupplimentandDose)Inthenotchedboxplot,iftwoboxesnotchesdonotoverlapthisisstrongevidencetheirmediansdiffer(Chambersetal.,1983,p.62).Colorsrecycle.Intheexampleabove,ifIhadlisted6colors,eachboxwouldhaveitsowncolor.EarlF.GlynnhascreatedaneasytouselistofcolorsisPDFformat.,OtherOptionsTheboxplot.matrix()functioninthesfsmiscpackagedrawsaboxplotforeachcolumn(row)inamatrix.Theboxplot.n()functioninthegplotspackageannotateseachboxplotwithitssamplesize.Thebplot()functionintheRlabpackageoffersmanymoreoptionscontrollingthepositioningandlabelingofboxesintheoutput.ViolinPlotsAviolinplotisacombinationofaboxplotandakerneldensityplot.Theycanbecreatedusingthevioplot()functionfromvioplotpackage.#ViolinPlotslibrary(vioplot)x1-mtcars$mpgmtcars$cyl=4x2-mtcars$mpgmtcars$cyl=6x3-mtcars$mpgmtcars$cyl=8vioplot(x1,x2,x3,names=c(4cyl,6cyl,8cyl),col=gold)title(ViolinPlotsofMilesPerGallon),Bagplot-A2DBoxplotExtensionThebagplot(x,y)functionintheaplpackpackageprovidesabivariateversionoftheunivariateboxplot.Thebagcontains50%ofallpoints.Thebivariatemedianisapproximated.Thefenceseparatespointsinthefencefrompointsoutside.Outliersaredisplayed.#ExampleofaBagplotlibrary(aplpack)attach(mtcars)bagplot(wt,mpg,xlab=CarWeight,ylab=MilesPerGallon,main=BagplotExample),Scatterplots,SimpleScatterplotTherearemanywaystocreateascatterplotinR.Thebasicfunctionisplot(x,y),wherexandyarenumericvectorsdenotingthe(x,y)pointstoplot.#SimpleScatterplotattach(mtcars)plot(wt,mpg,main=ScatterplotExample,xlab=CarWeight,ylab=MilesPerGallon,pch=19)#Addfitlinesabline(lm(mpgwt),col=red)#regressionline(yx)lines(lowess(wt,mpg),col=blue)#lowessline(x,y),Thescatterplot()functioninthecarpackageoffersmanyenhancedfeatures,includingfitlines,marginalboxplots,conditioningonafactor,andinteractivepointidentification.Eachofthesefeaturesisoptional.#EnhancedScatterplotofMPGvs.Weight#byNumberofCarCylinderslibrary(car)scatterplot(mpgwt|cyl,data=mtcars,xlab=WeightofCar,ylab=MilesPerGallon,main=EnhancedScatterPlot,labels=s(mtcars),ScatterplotMatricesThereareatleast4usefulfunctionsforcreatingscatterplotmatrices.Analystsmustlovescatterplotmatrices!#BasicScatterplotMatrixpairs(mpg+disp+drat+wt,data=mtcars,main=SimpleScatterplotMatrix),Thelatticepackageprovidesoptionstoconditionthescatterplotmatrixonafactor.#ScatterplotMatricesfromthelatticePackagelibrary(lattice)splom(mtcarsc(1,3,5,6),groups=cyl,data=mtcars,panel=panel.superpose,key=list(title=ThreeCylinderOptions,columns=3,points=list(pch=super.sym$pch1:3,col=super.sym$col1:3),text=list(c(4Cylinder,6Cylinder,8Cylinder),Thecarpackagecanconditionthescatterplotmatrixonafactor,andoptionallyincludelowessandlinearbestfitlines,andboxplot,densities,orhistogramsintheprincipaldiagonal,aswellasrugplotsinthemarginsofthecells.#ScatterplotMatricesfromthecarPackagelibrary(car)scatterplot.matrix(mpg+disp+drat+wt|cyl,data=mtcars,main=ThreeCylinderOptions),Thegcluspackageprovidesoptionstorearrangethevariablessothatthosewithhighercorrelationsareclosertotheprincipaldiagonal.Itcanalsocolorcodethecellstoreflectthesizeofthecorrelations.#ScatterplotMatricesfromtheglusPackagelibrary(gclus)dta-mtcarsc(1,3,5,6)#getdatadta.r-abs(cor(dta)#getcorrelationsdta.col-dmat.color(dta.r)#getcolors#reordervariablessothosewithhighestcorrelation#areclosesttothediagonaldta.o-order.single(dta.r)cpairs(dta,dta.o,panel.colors=dta.col,gap=.5,main=VariablesOrderedandColoredbyCorrelation),HighDensityScatterplotsWhentherearemanydatapointsandsignificantoverlap,scatterplotsbecomelessuseful.Thereareseveralapproachesthatbeusedwhenthisoccurs.Thehexbin(x,y)functioninthehexbinpackageprovidesbivariatebinningintohexagonalcells(itlooksbetterthanitsounds).#HighDensityScatterplotwithBinninglibrary(hexbin)x-rnorm(1000)y-rnorm(1000)bin-hexbin(x,y,xbins=50)plot(bin,main=HexagonalBinning),Anotheroptionforascatterplotwithsignificantpointoverlapisthesunflowerplot.Seehelp(sunflowerplot)fordetails.Finally,youcansavethescatterplotinPDFformatandusecolortransparencytoallowpointsthatoverlaptoshowthrough(thisideacomesfromB.S.EverritinHSAUR).#HighDensityScatterplotwithColorTransparencypdf(c:/scatterplot.pdf)x-rnorm(1000)y-rnorm(1000)plot(x,y,main=PDFScatterplotExample,col=rgb(0,100,0,50,maxColorValue=255),pch=16)dev.off()Note:Youcanusethecol2rgb()functiontogettherbgvaluesforRcolors.Forexample,col2rgb(darkgreen)yeildsr=0,g=100,b=0.Thenaddthealphatransparencylevelasthe4thnumberinthecolorvector.Avalueofzeromeansfullytransparent.Seehelp(rgb)formoreinformation.,3DScatterplotsYoucancreatea3Dscatterplotwiththescatterplot3dpackage.Usethefunctionscatterplot3d(x,y,z).#3DScatterplotlibrary(scatterplot3d)attach(mtcars)scatterplot3d(wt,disp,mpg,main=3DScatterplot)#3DScatterplotwithColoringandVerticalDropLineslibrary(scatterplot3d)attach(mtcars)scatterplot3d(wt,disp,mpg,pch=16,highlight.3d=TRUE,type=h,main=3DScatterplot)#3DScatterplotwithColoringandVerticalLines#andRegressionPlanelibrary(scatterplot3d)attach(mtcars)s3d-scatterplot3d(wt,disp,mpg,pch=16,highlight.3d=TRUE,type=h,main=3DScatterplot)fit-lm(mpgwt+disp)s3d$plane3d(fit),Spinning3DScatterplotsYoucanalsocreateaninteractive3Dscatterplotusingtheplot3D(x,y,z)functionintherglpackage.Itcreatesaspinning3Dscatterplotthatcanberotatedwiththemouse.Thefirstthreeargumentsarethex,y,andznumericvectorsrepresentingpoints.col=andsize=controlthecolorandsizeofthepointsrespectively.#Spinning3dScatterplotlibrary(rgl)plot3d(wt,disp,mpg,col=red,size=3)Youcanperformasimilarfunctionwiththescatter3d(x,y,z)intheRcmdrpackage.#AnotherSpinning3dScatterplotlibrary(Rcmdr)attach(mtcars)scatter3d(wt,disp,mpg),Correlations,Youcanusethecor()functiontoproducecorrelationsandthecov()functiontoproducescovariances.Asimplifiedformatiscor(x,use=,method=)whereOptionDescriptionxMatrixordataframeuseSpecifiesthehandlingofmissingdata.Optionsareall.obs(assumesnomissingdata-missingdatawillproduceanerror),complete.obs(listwisedeletion),plete.obs(pairwisedeletion)methodSpecifiesthetypeofcorrelation.Optionsarepearson,spearmanorkendall.#Correlations/covariancesamongnumericvariablesin#dataframemtcars.Uselistwisedeletionofmissingdata.cor(mtcars,use=complete.obs,method=kendall)cov(mtcars,use=complete.obs)cov(mtcars,use=complete.obs)Unfortunately,neithercor()orcov()producetestsofsignificance,althoughyoucanusethecor.test()functiontotestasinglecorrelationcoefficient.,Thercorr()functionintheHmiscpackageproducescorrelations/covariancesandsignificancelevelsforpearsonandspearmancorrelations.However,inputmustbeamatrixandpairwisedeletionisused.#Correlationswithsignificancelevelslibrary(Hmisc)rcorr(x,type=pearson)#typecanbepearsonorspearman#mtcarsisadataframercorr(as.matrix(mtcars)Youcanusetheformatcor(X,Y)orrcorr(X,Y)togeneratecorrelationsbetweenthecolumnsofXandthecolumnsofY.x-mtcars1:3y-mtcars4:6cor(x,y),OtherTypesofCorrelations,#polychoriccorrelation#xisacontingencytableofcountslibrary(polycor)polychor(x)#heterogeneouscorrelationsinonematrix#pearson(numeric-numeric),#polyserial(numeric-ordinal),#andpolychoric(ordinal-ordinal)#xisadataframewithorderedfactors#andnumericvariableslibrary(polycor)hetcor(x)#partialcorrelationslibrary(ggm)data(mydata)pcor(c(a,b,x,y,z),var(mydata)#partialcorrbetweenaandbcontrollingforx,y,z,VisualizingCorrelations,Usecorrgram()toplotcorrelograms.Usethepairs()orsplom()tocreatescatterplotmatrices.AgreatexampleofaplottedcorrelationmatrixcanbefoundintheRGraphGallery,library(corrgram)corrgram(mtcars,order=TRUE,lower.panel=panel.shade,upper.panel=panel.pie,text.panel=panel.txt,main=CarMilageDatainPC2/PC1Order),NonparametricTestsofGroupDifferences-wilcox,RprovidesfunctionsforcarryingoutMann-WhitneyU,WilcoxonSignedRank,KruskalWallis,andFriedmantests.#independent2-groupMann-WhitneyUTestwilcox.test(yA)#whereyisnumericandAisAbinaryfactor#independent2-groupMann-WhitneyUTestwilcox.test(y,x)#whereyandxarenumeric#dependent2-groupWilcoxonSignedRankTestwilcox.test(y1,y2,paired=TRUE)#wherey1andy2arenumericyoucanusethealternative=lessoralternative=greateroptiontospecifyaonetailedtest.,Examples,#gc-read.csv(/datasets/glob_clus.csv)gc-read.csv(glob_clus.csv)attach(gc)and-lumgal=M31mwg-lumgal=MWGhist(and)hist(and,breaks=100)hist(and,breaks=10000)hist(and,prob=T,main=)#scaley-axislikeadensityfunctiond-density(and)lines(d,col=2,lty=2,lwd=2)b0)#SameasWilcoxteststat2719*2720/4#ExpectedvalueofstatunderH0,Two-samplenonparametrictests,filter150&RA0&DE90&pmRA-60&pmDE0)sum(rank(c(H,nH)1:92)-92*93/2,Signtest,theta0theta0)pbinom(tstat-1,length(x),1/2,lower.tail=FALSE)pbinom(length(x)-tstat,length(x),1/2)mumu)pbinom(b-1,n,1/2,lower.tail=FALSE)1-pbinom(b-1,n,1/2)pbinom(n-b,n,1/2)thefirstlineheredoesexactlythesameaslinefiveintheexample,butislessaccurateforverysmallP-values.Theseconddoesexactlythesameaslinefiveoftheexamplebecauseofthesymmetryofthebinomialdistributionwithp=1/2.Foralower-tailedtestthefifthlinewouldbereplacedbypbinom(b,n,1/2),K-Stest,x-rnorm(50)y-runif(30)#Doxandycomefromthesamedistribution?ks.test(x,y)x2-rnorm(50,-1)plot(ecdf(x),xlim=range(c(x,x2)plot(ecdf(x2),add=TRUE,lty=dashed)t.test(x,x2,alternative=g)wilcox.test(x,x2,alternative=g)ks.test(x,x2,alternative=l)ks.test(rnorm(100),pnorm)#One-sampleKolmogorov-Smirnovtest,#KruskalWallisTestOneWayAnovabyRankskruskal.test(yA)#wherey1isnumericandAisafactor#RandomizedBlockDesign-FriedmanTestfriedman.test(yA|B)#whereyarethedatavalues,Aisagroupingfactor#andBisablockingfactorhttp:/www.r-,Thepackagenpmcprovidesnonparametricmultiplecomparisons.(Note:ThispackagehasbeenwithdrawnbutisstillavailableintheCRANarchives.)library(npmc)npmc(x)#wherexisadataframecontainingvariablevar#(responsevariable)andclass(groupingvariable),Example:carbscarbsattach(gfit)gfit.g=chisq.test(visit,p=ratio,rescale.p=T)gfit.ggfit.g$expgfit.g$res,Goodness-of-fitTestsforDiscreteDatawith“vcd”package,dummy-rnbinom(200,size=1.5,prob=0.8)gf-goodfit(dummy,type=nbinomial,method=MinChisq)summary(gf)plot(gf),dummy-rbinom(100,size=6,prob=0.5)gf1-goodfit(dummy,type=binomial,par=list(size=6)gf2-goodfit(dummy,type=binomial,par=list(prob=0.6,size=6)summary(gf1)plot(gf1)summary(gf2)plot(gf2),t-tests,Thet.test()functionproducesavarietyoft-tests.Unlikemoststatisticalpackages,thedefaultassumesunequalvarianceandappliestheWelshdfmodification.#independent2-groupt-testt.test(yx)#whereyisnumericandxisabinaryfactor#independent2-groupt-testt.test(y1,y2)#wherey1andy2arenumeric#pairedt-testt.test(y1,y2,paired=TRUE)#wherey1&y2arenumeric#onesamlet-testt.test(y,mu=3)#Ho:mu=3Youcanusethevar.equal=TRUEoptiontospecifyequalvariancesandapooledvarianceestimate.Youcanusethealternative=lessoralternative=greateroptiontospecifyaonetailedtest.var.test(A,B),A-scan()79.9880.0480.0280.0480.0380.0380.0479.9780.0580.0380.0280.0080.02B-scan()80.0279.9479.9879.9779.9780.0379.9579.97boxplot(A,B)t.test(A,B)var.test(A,B),ANOVA,IfyouhavebeenanalyzingANOVAdesignsintraditionalstatisticalpackages,youarelikelytofindRsapproachlesscoherentanduser-friendly.AgoodonlinepresentationonANOVAinRisavailablefromKatholiekeUniversiteitLeuven.,1.FitaModelInthefollowingexampleslowercaselettersarenumericvariablesanduppercaselettersarefactors.#OneWayAnova(CompletelyRandomizedDesign)fit-aov(yA,data=mydataframe)#RandomizedBlockDesign(Bistheblockingfactor)fit-aov(yA+B,data=mydataframe)#TwoWayFactorialDesignfit-aov(yA+B+A:B,data=mydataframe)fit-aov(yA*B,data=mydataframe)#samething#AnalysisofCovariancefit-aov(yA+x,data=mydataframe)Forwithinsubjectsdesigns,thedataframehastoberearrangedsothateachmeasurementonasubjectisaseparateobservation.SeeRandAnalysisofVariance.#OneWithinFactorfit-aov(yA+Error(Subject/A),data=mydataframe)#TwoWithinFactorsW1W2,TwoBetweenFactorsB1B2fit-aov(y(W1*W2*B1*B2)+Error(Subject/(W1*W2)+(B1*B2),data=mydataframe),Examples,aovdata-read.csv(carbs2.csv,header=TRUE)aa-read.csv(carbs.csv“)aa2-cbind(aa,id=1:10)library(reshape)aa3-melt(aa2,id=c(“id”)aovdata-aa3myaov-aov(growthsugar,data=aovdata)summary(myaov)fit-lm(growthsugar,data=aovdata)anova(fit)TukeyHSD(myaov),Examples-2,y1=c(18.2,20.1,17.6,16.8,18.8,19.7,19.1)y2=c(17.4,18.7,19.1,16.4,15.9,18.4,17.7)y3=c(15.2,18.8,17.7,16.5,15.9,17.1,16.7)y=c(y1,y2,y3)n=rep(7,3)group=rep(1:3,n)tmp=tapply(y,group,stem)stem(y)tmpfn=function(x)c(sum=sum(x),mean=mean(x),var=var(x),n=length(x)tapply(y,group,tmpfn)tmpfn(y),data=data.frame(y=y,group=factor(group)fit=lm(ygroup,data)anova(fit)df=anova(fit),Dfnames(df)=c(trt,err)alpha=c(0.05,0.01)qf(alpha,dftrt,dferr,lower.tail=FALSE)#Aconfidenceintervalonthepooledvariancecanbecomputedaswellusingtheanova(fit)object.anova(fit)Residuals,SumSqanova(fit)Residuals,SumSq/qchisq(c(0.025,0.975),18,lower.tail=FALSE),2.LookatDiagnosticPlotsDiagnosticplotsprovidechecksforheteroscedasticity,normality,andinfluentialobserverations.layout(matrix(c(1,2,3,4),2,2)#optionallayoutplot(fit)#diagnosticplotsFordetailsontheevaluationoftestrequirements,see(M)ANOVAAssumptions.,3.EvaluateModelEffectsWARNING:RprovidesTypeIsequentialSS,notthedefaultTypeIIImarginalSSreportedbySASandSPSS.Inanonorthogonaldesignwithmorethanonetermontherighthandsideoftheequationorderwillmatter(i.e.,A+BandB+Awillproducedifferentresults)!Wewillneedusethedrop1()functiontoproducethefamiliarTypeIIIresults.Itwillcompareeachtermwiththefullmodel.Alternatively,wecanuseanova(fit.model1,fit.model2)tocomparenestedmodelsdirectly.summary(fit)#displayTypeIANOVAtabledrop1(fit,.,test=F)#typeIIISSandFTests,

温馨提示

  • 1. 本站所有资源如无特殊说明,都需要本地电脑安装OFFICE2007和PDF阅读器。图纸软件为CAD,CAXA,PROE,UG,SolidWorks等.压缩文件请下载最新的WinRAR软件解压。
  • 2. 本站的文档不包含任何第三方提供的附件图纸等,如果需要附件,请联系上传者。文件的所有权益归上传用户所有。
  • 3. 本站RAR压缩包中若带图纸,网页内容里面会有图纸预览,若没有图纸预览就没有图纸。
  • 4. 未经权益所有人同意不得将文件中的内容挪作商业或盈利用途。
  • 5. 人人文库网仅提供信息存储空间,仅对用户上传内容的表现方式做保护处理,对用户上传分享的文档内容本身不做任何修改或编辑,并不能对任何下载内容负责。
  • 6. 下载文件中如有侵权或不适当内容,请与我们联系,我们立即纠正。
  • 7. 本站不保证下载资源的准确性、安全性和完整性, 同时也不承担用户因使用这些下载资源对自己和他人造成任何形式的伤害或损失。

评论

0/150

提交评论