统计可视化的颜值天花板：ggstatsplot

Stay hungry, stay foolish!

简介
安装
基本函数

ggbetweenstats
grouped_ggbetweenstats
gghistostats
ggdotplotstats
ggcorrmat
ggscatterstats
ggcoefstats

简介

ggstatsplot是ggplot2包的扩展，用于创建带有详细统计结果的图表。在典型的探索性数据分析工作流中，数据可视化和统计建模是两个不同的阶段。ggstatsplot的核心思想很简单：以包含详细统计细节的图形形式将这两个阶段合并为一个阶段，这使得数据探索更简单、更快。

使用时需要引用：

“Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach. Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167”

只用一行代码，该函数就产生了关于描述性统计、统计推断、效应量估计及其不确定性、两两比较、贝叶斯假设检验、贝叶斯后验估计及其不确定性的详细信息。

library(ggstatsplot)
## Registered S3 method overwritten by 'parameters':
##   method                         from      
##   format.parameters_distribution datawizard
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167

# Minimal one-line example: compare Sepal.Length across the Species groups
# of the built-in iris data.
ggbetweenstats(data = iris, x = Species, y = Sepal.Length)

这个包很大程度上依赖easystats和statsExpressions包，感兴趣的小伙伴可以自己查看或者等我更新。😄

安装

# Run only ONE of the two installation commands below.

# Option 1: install from the configured package repository (typically CRAN).
install.packages(pkgs = "ggstatsplot")

# Option 2: install from the GitHub repository (requires the remotes package).
remotes::install_github(repo = "IndrajeetPatil/ggstatsplot")

基本函数

ggbetweenstats

用于组间比较。下面是一个4组间比较的例子。

library(gapminder)

# Print a compact overview of the gapminder data: one line per column with
# its type and first values.
dplyr::glimpse(gapminder::gapminder)
## Rows: 1,704
## Columns: 6
## $ country   <fct> "Afghanistan", "Afghanistan", "Afghanistan", "Afghanistan", ~
## $ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, ~
## $ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 1997, ~
## $ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 40.8~
## $ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372, 12~
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.1134, ~

# The confidence intervals are obtained by bootstrapping, so the RNG seed has
# to be fixed for the result to be reproducible.
set.seed(123)

# Life expectancy (y) by continent (x), using only the 2007 rows and leaving
# Oceania out.
ggbetweenstats(
  y = lifeExp,
  x = continent,
  data = dplyr::filter(
    gapminder::gapminder,
    year == 2007 & continent != "Oceania"
  )
)

自动选择t检验（2组）或者方差分析（多于2组）
返回ggplot2对象，可以无缝对接ggplot2语法

还可以通过超多参数自定义图形！

library(ggplot2)

# Fix the RNG seed so the plot is reproducible.
set.seed(123)

# Same continent comparison as a customised plot. The value returned by
# ggbetweenstats() is combined with an ordinary ggplot2 scale via `+`.
ggbetweenstats(
  # --- data and variables ---
  data = dplyr::filter(gapminder, year == 2007 & continent != "Oceania"),
  x = continent, # x-axis / independent variable
  y = lifeExp, # y-axis / dependent variable
  # --- statistics: one of parametric, nonparametric, robust, bayes ---
  type = "robust",
  # --- geometry: one of box, violin, boxviolin ---
  plot.type = "boxviolin",
  # --- outliers ---
  outlier.tagging = TRUE, # whether to label outliers
  outlier.coef = 1.5, # coefficient for Tukey's rule
  outlier.label = country, # label to attach to outlier values
  outlier.label.args = list(color = "red"), # outlier point label color
  # --- text ---
  title = "Comparison of life expectancy across continents (Year: 2007)",
  caption = "Source: Gapminder Foundation",
  xlab = "Continent",
  ylab = "Life expectancy",
  # --- appearance ---
  ggtheme = ggplot2::theme_minimal(), # switch the theme
  # package that provides the palette; the supported packages can be listed
  # with View(paletteer::palettes_d_names)
  package = "yarrr",
  palette = "info2" # colour palette
) + # append regular ggplot2 syntax
  ggplot2::scale_y_continuous(
    breaks = seq(35, 85, by = 5),
    limits = c(35, 85)
  )

提供一个combine_plots()函数用于多个图形的组合：

# Fix the RNG seed so both plots are reproducible.
set.seed(123)

# Keep only the two years that are compared.
df_year <- dplyr::filter(gapminder::gapminder, year %in% c(1957, 2007))

# Plot 1: parametric test (type = "p") shown as a box plot.
p1 <- ggbetweenstats(
  data = df_year,
  x = year,
  y = lifeExp,
  type = "p",
  plot.type = "box",
  conf.level = 0.99,
  title = "Parametric test",
  xlab = "Year",
  ylab = "Life expectancy",
  package = "ggsci",
  palette = "nrc_npg"
)

# Plot 2: Mann-Whitney U test (the nonparametric counterpart of the t-test,
# type = "np") shown as a violin plot.
p2 <- ggbetweenstats(
  data = df_year,
  x = year,
  y = lifeExp,
  type = "np",
  plot.type = "violin",
  conf.level = 0.99,
  title = "Non-parametric Test (violin plot)",
  xlab = "Year",
  ylab = "Life expectancy",
  package = "ggsci",
  palette = "uniform_startrek"
)

# Merge the two individual plots into one figure (a single row) with a shared
# title and caption.
combine_plots(
  list(p1, p2),
  annotation.args = list(
    title = "Comparison of life expectancy between 1957 and 2007",
    caption = "Source: Gapminder Foundation"
  ),
  plotgrid.args = list(nrow = 1)
)

grouped_ggbetweenstats

这个函数很强大：它会先按某个变量分组，再为每一组分别画图，最后把多张图放在同一个图形中。有点类似group_by的感觉，但并不完全一样。

下面介绍的所有函数都是支持这一特性的！

# Fix the RNG seed so the plots are reproducible.
set.seed(123)

# One continent comparison per year (grouping.var = year), stacked in a
# single figure. The filtered data are passed through `data =`, like in the
# other examples, instead of being piped with `%>%`: no library() call in this
# tutorial attaches a package that is guaranteed to provide that operator.
grouped_ggbetweenstats(
  data = dplyr::filter(
    gapminder::gapminder,
    year %in% c(1967, 1987, 2007),
    continent != "Oceania"
  ),
  x = continent,
  y = lifeExp,
  grouping.var = year,
  xlab = "Continent",
  ylab = "Life expectancy",
  pairwise.display = "significant", # show only the significant comparisons
  p.adjust.method = "fdr",
  package = "ggsci",
  palette = "default_jco",
  outlier.tagging = TRUE,
  outlier.label = country,
  # arguments used for arranging the individual plots
  annotation.args = list(title = "Changes in life expectancy across continents (1967-2007)"),
  plotgrid.args = list(nrow = 3)
)

这个图非常棒，分别展示了1967年、1987年、2007年不同大洲之间预期寿命的差异！

这个图形也可以通过ggbetweenstats和purrr包实现。

还有一个ggwithinstats函数和grouped_ggwithinstats，形式和上面两个完全一样，就不介绍了。

gghistostats

用于可视化直方图，当然也有一个grouped_gghistostats()函数用于分组画图。

# Fix the RNG seed so the plot is reproducible.
set.seed(123)

# Histogram of the ACT column of psych::sat.act, with 20 passed as test.value.
gghistostats(
  data = psych::sat.act,
  x = ACT, # numeric variable
  test.value = 20,
  title = "Distribution of ACT Scores",
  xlab = "ACT Score",
  caption = "Data courtesy of: SAPA project (https://sapa-project.org)"
)

ggdotplotstats

画点图。当然也是支持grouped_ggdotplotstats

# Fix the RNG seed so the plot is reproducible.
set.seed(123)
library(ggstatsplot)

# Rows of ggplot2::mpg whose cyl value matches "4" or "6".
df <- dplyr::filter(ggplot2::mpg, cyl %in% c("4", "6"))

# One colour per distinct manufacturer in `df`, taken from the
# "palettetown::venusaur" palette.
paletter_vector <- paletteer::paletteer_d(
  "palettetown::venusaur",
  n = nlevels(as.factor(df$manufacturer)),
  type = "discrete"
)

# Dot plot of city mileage (x) per manufacturer (y); the points are coloured
# with the palette built above.
ggdotplotstats(
  data = df,
  x = cty,
  y = manufacturer,
  test.value = 15.5,
  point.args = list(shape = 16, color = paletter_vector, size = 5),
  title = "Distribution of mileage of cars",
  xlab = "city miles per gallon",
  ylab = "car manufacturer",
  ggtheme = ggplot2::theme_dark()
)

ggcorrmat

可视化相关矩阵，之前介绍过一个全能的相关矩阵可视化R包：corrplot。两者在功能上有很多重复，不过今天这个在拼图方面很有优势。

# Fix the RNG seed: the data passed below are a random 5% sample of diamonds.
set.seed(123)

# Correlation matrix for the selected diamonds columns, shown with readable
# display names (one name per selected variable, in the same order).
ggcorrmat(
  data = dplyr::sample_frac(ggplot2::diamonds, size = 0.05),
  cor.vars = c(carat, depth:z), # variables to include
  cor.vars.names = c(
    "carat", "total depth", "table", "price",
    "length (in mm)", "width (in mm)", "depth (in mm)"
  ),
  ggcorrplot.args = list(hc.order = TRUE, outline.color = "black")
)

是不是看起来很美观，其实借用了ggcorrplot包。🤪

接下来看看比较有意思的组合图形：grouped_ggcorrmat

# Fix the RNG seed so the plots are reproducible.
set.seed(123)

# One correlation matrix per level of `cut`, arranged in a single figure.
grouped_ggcorrmat(
  # arguments relevant to `ggcorrmat`
  data = ggplot2::diamonds,
  grouping.var = cut,
  type = "bayes",
  # arguments relevant to `combine_plots`
  annotation.args = list(
    title = "Relationship between diamond attributes and price across cut",
    caption = "Dataset: Diamonds from ggplot2 package",
    tag_levels = "a"
  ),
  plotgrid.args = list(nrow = 3)
)

颜值还可以吧？

ggscatterstats

散点图，适合两个变量的相关性探索，自动添加边际图形，支持更换图形类型，支持grouped_ggscatterstats()。

# Fix the RNG seed so the plot is reproducible.
set.seed(123)

# Scatter plot of movie budget (x) against rating (y) for movies_long.
ggscatterstats(
  data = movies_long,
  x = budget,
  y = rating,
  # point labels
  label.var = title,
  label.expression = rating < 5 & budget > 100, # which points get a label
  point.label.args = list(alpha = 0.7, size = 4, color = "grey50"),
  # marginal plots
  xfill = "#CC79A7", # fill for marginals on the x-axis
  yfill = "#009E73", # fill for marginals on the y-axis
  # text
  title = "Relationship between movie budget and IMDB rating",
  xlab = "Budget (in millions of US dollars)", # label for the x-axis
  ylab = "Rating on IMDB", # label for the y-axis
  caption = "Source: www.imdb.com"
)
## Registered S3 method overwritten by 'ggside':
##   method from   
##   +.gg   ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpiestats：画饼图。

# Fix the RNG seed so the plot is reproducible.
set.seed(123)

# Pie chart of Survived, split by Sex, for the Titanic_full data. A ggplot2
# theme tweak for the plot title is added on top with `+`.
ggpiestats(
  data = Titanic_full,
  x = Survived,
  y = Sex,
  k = 3, # number of decimal places in the results
  perc.k = 1, # number of decimal places in the percentages
  package = "ggsci",
  palette = "category10_d3",
  ggtheme = ggplot2::theme_grey(),
  legend.title = "Survived?",
  title = "Passenger survival on the Titanic by gender",
  caption = "Source: Titanic survival dataset"
) +
  ggplot2::theme(
    plot.title = ggplot2::element_text(color = "black", size = 14, hjust = 0)
  )

ggcoefstats

可视化模型的系数估计值及其可信区间，支持100多个常见的模型！！比如各种回归模型，生存分析，方差分析等等，非常全面！

# Show the first six entries of the list of supported models.
utils::head(insight::supported_models(), n = 6L)
## [1] "aareg"     "afex_aov"  "AKP"       "Anova.mlm" "aov"       "aovlist"

# Try one of the supported models: an aareg fit on the lung data.
library(survival)
set.seed(123)

# Fit the model; Surv() comes from the survival package attached above.
afit <- survival::aareg(
  Surv(time, status) ~ age + sex + ph.ecog,
  data = lung,
  dfbeta = TRUE
)

# Plot the fitted model with ggcoefstats().
ggcoefstats(
  afit,
  k = 3,
  title = "Aalen's additive regression model",
  subtitle = "(for censored data)"
)

非常好看！

看到这里都是真爱，这个包还有很多细节都没介绍到，比如对于不同的数据会如何选取合适的统计方法，各种自定义的选项等等，这些在官网[1]都有详细的介绍，每个函数都会以表格的形式列出来，大家感兴趣的可以自行查看哦。

参考资料

[1]

ggstatsplot官网: https://indrajeetpatil.github.io/ggstatsplot/index.html

以上就是今天的内容，希望对你有帮助哦！欢迎点赞、在看、关注、转发！

本站仅提供存储服务，所有内容均由用户发布，如发现有害或侵权内容，请点击举报。