library(tidyverse)
library(tidytuesdayR)
Himalayan expedition data
Packages used
Get the data
= tidytuesdayR::tt_load(2025, week = 3) himalayan.data
Assign different data to different variables
= himalayan.data$exped_tidy
exped.data = himalayan.data$peaks_tidy peak.data
Check the data
head(exped.data)
# A tibble: 6 × 69
EXPID PEAKID YEAR SEASON SEASON_FACTOR HOST HOST_FACTOR ROUTE1 ROUTE2 ROUTE3
<chr> <chr> <dbl> <dbl> <chr> <dbl> <chr> <chr> <chr> <lgl>
1 EVER… EVER 2020 1 Spring 2 China N Col… <NA> NA
2 EVER… EVER 2020 1 Spring 2 China N Col… <NA> NA
3 EVER… EVER 2020 1 Spring 2 China N Col… <NA> NA
4 AMAD… AMAD 2020 3 Autumn 1 Nepal SW Ri… <NA> NA
5 AMAD… AMAD 2020 3 Autumn 1 Nepal SW Ri… <NA> NA
6 AMAD… AMAD 2020 3 Autumn 1 Nepal SW Ri… <NA> NA
# ℹ 59 more variables: ROUTE4 <lgl>, NATION <chr>, LEADERS <chr>,
# SPONSOR <chr>, SUCCESS1 <lgl>, SUCCESS2 <lgl>, SUCCESS3 <lgl>,
# SUCCESS4 <lgl>, ASCENT1 <chr>, ASCENT2 <chr>, ASCENT3 <lgl>, ASCENT4 <lgl>,
# CLAIMED <lgl>, DISPUTED <lgl>, COUNTRIES <chr>, APPROACH <chr>,
# BCDATE <date>, SMTDATE <date>, SMTTIME <chr>, SMTDAYS <dbl>, TOTDAYS <dbl>,
# TERMDATE <date>, TERMREASON <dbl>, TERMREASON_FACTOR <chr>, TERMNOTE <chr>,
# HIGHPOINT <dbl>, TRAVERSE <lgl>, SKI <lgl>, PARAPENTE <lgl>, CAMPS <dbl>, …
colnames(exped.data)
[1] "EXPID" "PEAKID" "YEAR"
[4] "SEASON" "SEASON_FACTOR" "HOST"
[7] "HOST_FACTOR" "ROUTE1" "ROUTE2"
[10] "ROUTE3" "ROUTE4" "NATION"
[13] "LEADERS" "SPONSOR" "SUCCESS1"
[16] "SUCCESS2" "SUCCESS3" "SUCCESS4"
[19] "ASCENT1" "ASCENT2" "ASCENT3"
[22] "ASCENT4" "CLAIMED" "DISPUTED"
[25] "COUNTRIES" "APPROACH" "BCDATE"
[28] "SMTDATE" "SMTTIME" "SMTDAYS"
[31] "TOTDAYS" "TERMDATE" "TERMREASON"
[34] "TERMREASON_FACTOR" "TERMNOTE" "HIGHPOINT"
[37] "TRAVERSE" "SKI" "PARAPENTE"
[40] "CAMPS" "ROPE" "TOTMEMBERS"
[43] "SMTMEMBERS" "MDEATHS" "TOTHIRED"
[46] "SMTHIRED" "HDEATHS" "NOHIRED"
[49] "O2USED" "O2NONE" "O2CLIMB"
[52] "O2DESCENT" "O2SLEEP" "O2MEDICAL"
[55] "O2TAKEN" "O2UNKWN" "OTHERSMTS"
[58] "CAMPSITES" "ROUTEMEMO" "ACCIDENTS"
[61] "ACHIEVMENT" "AGENCY" "COMRTE"
[64] "STDRTE" "PRIMRTE" "PRIMMEM"
[67] "PRIMREF" "PRIMID" "CHKSUM"
Disclaimer
Since 8000 meter peaks are the most prestigious ones in terms of mountaineering, we will create a separate data file containing only the 8000m peaks. First, get the names of the peaks from the peak.data
file.
= peak.data |> filter(HEIGHTM >= 8000) |> pull(PEAKID) peak_8k_names
Just to know the names of the peaks, we can extract the PKNAME
instead of PEAKID
|> filter(HEIGHTM >= 8000) |> pull(PKNAME) peak.data
[1] "Annapurna I" "Annapurna I East" "Annapurna I Middle"
[4] "Cho Oyu" "Dhaulagiri I" "Everest"
[7] "Kangchenjunga Central" "Kangchenjunga" "Kangchenjunga South"
[10] "Lhotse" "Lhotse Shar" "Makalu"
[13] "Manaslu" "Yalung Kang" "Lhotse Middle"
[16] "Yalung Kang West"
We can see there are 16 different names. Although there are some names missing from the actual list of 8000m peaks. I have no idea why some are missing.
Now we extract the expedition details of these 8000m peaks.
.8k = exped.data |> filter(PEAKID %in% peak_8k_names) exped
Questions
1. What is the most successful route for the expedition for 8000m peaks?
There are 4 different routes enlisted for each peaks, assigned by ROUTE1
, ROUTE2
, ROUTE3
and ROUTE4
. The successful climbing using these routes are stored in corresponding logical vector SUCCESS1
, SUCCESS2
, SUCCESS3
, SUCCESS4
. We will use these SUCCESS data along with the exped.8k
dataset.
.8k |> pivot_longer(cols = c(SUCCESS1, SUCCESS2, SUCCESS3, SUCCESS4),
expednames_to = "route_success",
values_to = "success") |>
group_by(route_success) |>
summarise(success_rate = (mean(success, na.rm = TRUE))*100) |>
ggplot(aes(x = route_success, y = success_rate, fill = route_success)) +
geom_col(width = 0.6, color = "black") +
labs(
title = "Success rate by route",
x = "Route (1-4)",
y = "Success rate (%)"
)
2. Which countries has lead the most successful expedition?
Again, we will use the SUCCESS
vector; now with the NATION
vector; with the exped.data
for the details of the all peaks.
|> filter(SUCCESS1 == TRUE | SUCCESS2 == TRUE | SUCCESS3 == TRUE | SUCCESS4 == TRUE) |>
exped.data count(NATION, sort = TRUE) |>
ggplot(aes(x = reorder(NATION, n), y = n)) +
geom_col(fill = "steelblue") +
geom_text(aes(label = n), hjust = -0.1, size = 3) +
labs(
title = "Successful expedition by nation",
x = "Nation",
y = "Number of successful expedition"
+
) coord_flip()
What about the 8000m peaks
We will just use the exped.8k
data instead of exped.data
from the previous chunk.
.8k |> filter(SUCCESS1 == TRUE | SUCCESS2 == TRUE | SUCCESS3 == TRUE | SUCCESS4 == TRUE) |>
expedcount(NATION, sort = TRUE) |>
ggplot(aes(x = reorder(NATION, n), y = n)) +
geom_col(fill = "orangered4") +
geom_text(aes(label = n), hjust = -0.1, size = 3) +
labs(
title = "Successful expedition by nation on 8000m peaks",
x = "Nation",
y = "Number of successful expedition"
+
) coord_flip()
What about the one we actually care about (Everest
)
.8k |> filter(SUCCESS1 == TRUE | SUCCESS2 == TRUE | SUCCESS3 == TRUE | SUCCESS4 == TRUE) |>
expedfilter(PEAKID == "EVER") |> #just need to filter the data for Everest
count(NATION, sort = TRUE) |>
ggplot(aes(x = reorder(NATION, n), y = n)) +
geom_col(fill = "lightsalmon3") +
geom_text(aes(label = n), hjust = -0.1, size = 3) +
labs(
title = "Successful expedition by nation on Everest",
x = "Nation",
y = "Number of successful expedition"
+
) coord_flip()
3. How does the travel time vary in different seasons?
|> filter(!is.na(TOTDAYS), !is.na(SEASON_FACTOR)) |>
exped.data ggplot(aes(x = SEASON_FACTOR, y = TOTDAYS, fill = SEASON_FACTOR)) +
geom_jitter(aes(colour = SEASON_FACTOR)) +
scale_color_viridis_d(alpha = 1) +
labs(
title = "Expedition duration by season",
x = "Season",
y = "Total days of expedition"
+
) theme_minimal() +
theme(legend.position = "none")
4. Most common reasons for expedition termination
|> count(TERMREASON_FACTOR, sort=TRUE) |>
exped.data ggplot(aes(x = reorder(TERMREASON_FACTOR,n), y = n, fill = n)) +
geom_col() + geom_text(aes(label = n, hjust = -0.5)) +
coord_flip() +
labs(
title = "Common reasons for expedition termination",
x = "Number",
y = "Reasons"
)
5. Which peak has the highest fatality rate?
|> left_join(select(peak.data, PEAKID, PKNAME), by = "PEAKID") |>
exped.data group_by(PKNAME) |>
summarise(
expeditions = n(),
total.deaths = sum(MDEATHS+HDEATHS),
fatality.rate = total.deaths / expeditions) |>
arrange(fatality.rate) |>
slice_max(fatality.rate, n = 10) |>
ggplot(aes(x = reorder(PKNAME, fatality.rate), y = fatality.rate)) +
geom_col(fill = "maroon") +
coord_flip() +
labs(
title = "Fatality rate in himalayan peaks",
x = "Peak name",
y = "Fatality rate"
+
) theme_minimal()
Among the 8000m peaks
.8k |> left_join(select(peak.data, PEAKID, PKNAME), by = "PEAKID") |>
expedgroup_by(PKNAME) |>
summarise(
expeditions = n(),
total.deaths = sum(MDEATHS+HDEATHS),
fatality.rate = total.deaths / expeditions) |>
arrange(fatality.rate) |>
#slice_max(fatality.rate, n = 10) |>
ggplot(aes(x = reorder(PKNAME, fatality.rate), y = fatality.rate)) +
geom_col(fill = "firebrick3") +
coord_flip() +
labs(
title = "Fatality rate in 8k peaks",
x = "Peak name",
y = "Fatality rate"
+
) theme_minimal()
In case of number of fatalities
on the peaks
|> left_join(select(peak.data, PEAKID, PKNAME), by = "PEAKID") |>
exped.data group_by(PKNAME) |>
summarise(total.deaths = sum(MDEATHS+HDEATHS)) |>
arrange(total.deaths) |>
slice_max(total.deaths, n = 10) |>
ggplot(aes(x = reorder(PKNAME, total.deaths), y = total.deaths)) +
geom_col(fill = "red4") +
coord_flip() +
labs(
title = "Number of deaths in himalayan peaks",
x = "Peak name",
y = "No. of death"
+
) geom_text(aes(label = total.deaths), hjust = -.3) +
theme_minimal()
6. Does the use of supplemental oxygen affect the success rate? and death rate?
|> mutate(success = SUCCESS1 | SUCCESS2 | SUCCESS3 | SUCCESS4,
exped.data death = MDEATHS + HDEATHS) |>
group_by(O2USED) |>
summarise(success.rate = (mean(success, na.rm = TRUE)),
death.rate = mean(death > 0, na.rm = TRUE)) |>
pivot_longer(cols = c(success.rate, death.rate), names_to = "names", values_to = "rate") |>
ggplot(aes(x = O2USED, y = rate, fill = names)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Impact of supplemental Oxygen use on success and deaths",
x = "Oxygen Used", y = "Rate")
7. Which peaks have the most attempts but lowest summit rate? (Hardest peaks to summit)
|>
exped.data mutate(success = SUCCESS1 | SUCCESS2 | SUCCESS3 | SUCCESS4) |>
group_by(PEAKID) |>
summarise(
attempts = n(),
successes = sum(success, na.rm = TRUE),
summit.rate = successes / attempts) |>
left_join(select(peak.data, PEAKID, PKNAME), by = "PEAKID") |>
filter(!is.na(PKNAME), attempts >= 5) |>
arrange(attempts) |>
slice_head(n = 10) |>
arrange(summit.rate) |>
#slice_head(n = 10) |>
ggplot(aes(x = reorder(PKNAME, desc(summit.rate)), y = summit.rate)) +
geom_col(fill = "darkred") +
geom_text(aes(label = scales::percent(summit.rate, accuracy = 0.1)), hjust = -0.1) +
coord_flip() +
labs(
title = "Top 10 Hardest Peaks to Summit",
subtitle = "Peaks with most attempts but lowest summit success rate",
x = "Peak Name",
y = "Summit Success Rate"
+
) theme_minimal()