Get the coordinates of volcanoes from Wikipedia pages

You can find a lot of information on Wikipedia pages, but gathering it manually can take a long time. R can be used to scrape this information automatically. Let's see how it can retrieve the coordinates of volcanoes in Central America. The web pages are here:
https://en.wikipedia.org/wiki/List_of_volcanoes_in_Mexico
https://en.wikipedia.org/wiki/List_of_volcanoes_in_Guatemala
https://en.wikipedia.org/wiki/List_of_volcanoes_in_El_Salvador
https://en.wikipedia.org/wiki/List_of_volcanoes_in_Nicaragua
https://en.wikipedia.org/wiki/List_of_volcanoes_in_Costa_Rica


library(maps)
library(mapdata)
library(maptools)
library(XML)


# Note a problem with readHTMLTable(): it cannot be used directly with the url.
# readHTMLTable(url, header=FALSE, stringsAsFactors = FALSE) generates an error.

# Instead, download the page to a file in the temporary directory first.
# Download one Wikipedia "List of volcanoes in ..." page into the session
# temporary directory and return one of its tables as a data frame with a
# Country column appended.
#
# Arguments:
#   page        last path component of the Wikipedia URL,
#               e.g. "List_of_volcanoes_in_Mexico"
#   table_index position of the wanted table among the non-NULL tables
#               returned by readHTMLTable()
#   header_rows indices of the leading rows to drop; the tables are read with
#               header = FALSE, so the header lines come back as data rows
#   country     value stored in the Country column
#
# Stops with an explicit message when the download reports a failure or when
# the page holds fewer tables than table_index.
read_volcano_table <- function(page, table_index, header_rows, country) {
  url <- paste0("https://en.wikipedia.org/wiki/", page)
  # The page is saved locally first because readHTMLTable() is given a file
  # rather than the url.
  dest <- file.path(tempdir(), paste0(page, ".html"))
  status <- download.file(url, dest)
  if (status != 0) {
    stop("Download failed for ", url, call. = FALSE)
  }

  tables <- readHTMLTable(dest, header = FALSE, stringsAsFactors = FALSE)
  tables <- Filter(Negate(is.null), tables)
  if (length(tables) < table_index) {
    stop(
      "Expected at least ", table_index, " tables in ", url,
      " but found ", length(tables),
      call. = FALSE
    )
  }

  cbind(tables[[table_index]][-header_rows, ], Country = country)
}

# One call per country; the table position and the number of header rows
# differ from page to page.
Volcano <- rbind(
  read_volcano_table("List_of_volcanoes_in_Mexico", 3, c(1, 2), "Mexico"),
  read_volcano_table("List_of_volcanoes_in_Guatemala", 2, 1, "Guatemala"),
  read_volcano_table("List_of_volcanoes_in_El_Salvador", 2, 1, "El Salvador"),
  read_volcano_table("List_of_volcanoes_in_Nicaragua", 2, 1, "Nicaragua"),
  read_volcano_table("List_of_volcanoes_in_Costa_Rica", 2, c(1, 2), "Costa Rica")
)

# Give the combined table its final column names.
colnames(Volcano) <- c("Name", "Altitude", "Elevation", "Coordinates", "Eruption", "Country")
# Number the rows before filtering, so every remaining volcano keeps the
# index it had in the full list. seq_len() also copes with an empty table.
row.names(Volcano) <- as.character(seq_len(nrow(Volcano)))
# Keep only the volcanoes that have coordinates: drop missing values and the
# dash placeholders ("-" and "—").
has_coordinates <- !is.na(Volcano[, "Coordinates"]) &
  !(Volcano[, "Coordinates"] %in% c("-", "—"))
Volcano <- Volcano[has_coordinates, ]

# Extract the decimal latitude/longitude from one coordinate entry that has
# already been split on " / ".
#
# The third piece is expected to look like "<lat>; <lon> (<name>)". The
# trailing " (...)" part is removed, the rest is split on "; " and any
# non-ASCII characters are stripped so that as.numeric() can parse the
# values. Always returns a character vector of length 2; an entry that does
# not yield exactly two values gives c(NA, NA), so one malformed entry
# cannot shift the rows that follow it.
extract_decimal_pair <- function(pieces) {
  decimal <- gsub(" \\(.+\\)", "", pieces[3])
  parts <- strsplit(decimal, "; ")[[1]]
  if (length(parts) != 2 || anyNA(parts)) {
    return(c(NA_character_, NA_character_))
  }
  iconv(parts, from = "", to = "ASCII", sub = "")
}

# Each entry holds several notations separated by " / "; only the decimal
# one is used.
coordinate_pieces <- strsplit(Volcano[, "Coordinates"], " / ")
# vapply() returns a 2 x n matrix; transpose it to one row per volcano.
lc <- t(vapply(coordinate_pieces, extract_decimal_pair, character(2)))
Volcano <- cbind(Volcano, latitude = as.numeric(lc[, 1]), longitude = as.numeric(lc[, 2]))

# Draw the map, limited to the range of the volcano coordinates (missing
# values ignored), then overlay each volcano as a filled red point.
lon <- Volcano[, "longitude"]
lat <- Volcano[, "latitude"]
map(
  "worldHires",
  xlim = range(lon, na.rm = TRUE),
  ylim = range(lat, na.rm = TRUE),
  mar = c(4, 5, 1, 1)
)
points(lon, lat, col = "red", pch = 19)
grid()

# Add the bottom (1) and left (2) axes; las = 1 keeps the left labels
# horizontal.
degAxis(1)
degAxis(2, las = 1)

Commentaires

Posts les plus consultés de ce blog

Standard error from Hessian Matrix... what can be done when problem occurs

Install treemix in ubuntu 20.04

stepAIC from package MASS with AICc