.Rhistory

)
count <-
c(
NROW(otherRankData),
NROW(otherRankData[otherRankData$taxonRank == "GENUS", ]),
NROW(otherRankData[otherRankData$taxonRank == "FAMILY", ]),
NROW(otherRankData[otherRankData$taxonRank == "ORDER", ]),
NROW(otherRankData[otherRankData$taxonRank == "CLASS", ])
)
percentageTable <- data.frame(count, row.names = names)
total <- NROW(GBIF_Data)
unresolved <- NROW(otherRankData)
percentageTable$percentageOfTotalData <-
round(percentageTable$count / total * 100, 2)
percentageTable$percentageOfUnresolvedData <-
round(percentageTable$count / unresolved * 100, 2)
# --------- End of Percentage Table -----------#
# --------- Names Table -----------#
namesToResolve <-
sort(table(otherRankData$scientificName), decreasing = T) # Sorting the names by frequency,
# so that the names to be resolved first, are greater part of the complete data
count <-
as.vector(namesToResolve) # the count of records for each scientific names
if (resolve == dim(GBIF_Data)[1]) {
resolve <- length(count)
}
namesPercentageTable <-
data.frame(
count,
percentOfUnresolvedData = round(count / unresolved * 100, 2),
row.names = (names(namesToResolve))
)
# --------- End of Names Table -----------#
# --------- Misspellings -----------#
misspellingsData <-
sapply(names(namesToResolve)[1:resolve], function(name) {
namelookup <- tryCatch(
# this error catch is because name_lookup throws error and halts
# program when unknown query is provided
rgbif::name_lookup(query = name),
error = function(e)
e
)
return(unique(namelookup$data$scientificName))
})
# --------- End of Misspellings -----------#
# --------- Resolving -----------#
if (simplify == FALSE) {
answer <-
sapply(names(namesToResolve)[1:resolve], function(nameToResolve) {
s <- Sys.time()
# Finding range for later use
lat <-
as.numeric(otherRankData[otherRankData$scientificName == nameToResolve, ]$decimalLatitude)
lat <-
lat[lat != 0]  # some records have 0.0000 as the NA equivalent. So its not the
# actual 0.00 coordinate value but representation of missing value
long <-
as.numeric(otherRankData[otherRankData$scientificName == nameToResolve, ]$decimalLongitude)
long <-
long[long != 0]# some records have 0.0000 as the NA equivalent. So its not the
# actual 0.00 coordinate value but representation of missing value
df <- data.frame(long, lat)
originLat <-
range(lat, na.rm = T)
originLong <-
range(long, na.rm = T)
# End of Finding range for later use
# Stripping only the first two parts of scientific name. This is because,
# gnr_resolve works well only when giving first 2 parts
originName <- nameToResolve
nameToResolve <-
sub("^(\\S*\\s+\\S+).*", "\\1", (nameToResolve)) # TODO fix tailing ',' problem
# Finding probable species names
gnrResults <- gnr_resolve(nameToResolve)
gnrResults <-
gnrResults[gnrResults$score >= gnr_score, ]
candidateList <- unique(gnrResults$matched_name)
possibleSpecies <-
sapply(candidateList, function(candidate) {
namelookup <-
tryCatch(
# this error catch is because name_lookup throws error and halts
# program when unknown query is provided
name_lookup(query = candidate),
error = function(e)
e
)  # TODO: filter using parameters
if (upto == "family") {
log <-
namelookup$data$rank == "SPECIES" |
namelookup$data$rank == "SUBSPECIES" |
namelookup$data$rank == "VARIETY" |
namelookup$data$rank == "GENUS" |
namelookup$data$rank == "FAMILY"
} else if (upto == "genus") {
log <-
namelookup$data$rank == "SPECIES" |
namelookup$data$rank == "SUBSPECIES" |
namelookup$data$rank == "VARIETY" |
namelookup$data$rank == "GENUS"
} else if (upto == "species") {
log <-
namelookup$data$rank == "SPECIES" |
namelookup$data$rank == "SUBSPECIES" |
namelookup$data$rank == "VARIETY"
} else if (upto == "subspecies") {
log <- namelookup$data$rank == "SUBSPECIES"
}
dat <-
namelookup$data[log , ]
unique(dat$scientificName)
})
possibleSpecies <-
unique(as.vector(unlist(possibleSpecies)))
possibleSpecies <-
possibleSpecies[!is.na(possibleSpecies)] # sometimes name_lookup gives NA as a possible name
# --------- End of Finding probable species names
# --------- Plotting the origin data
latLonCenter <-
suppressMessages(geocode(location))
newmap <- suppressMessages(get_map(
location = c(lon = latLonCenter$lon, lat = latLonCenter$lat),
zoom = zoom
))
plot <- ggmap(newmap)
plot <-
plot + geom_point(
data = df,
aes(long, lat),
color = "blue",
size = 4
)
# suppressMessages(print(plot))
dir.create("TaxonResults")
path <-
paste("TaxonResults//",
originName,
sep = "")
dir.create(path)
path <-
paste("TaxonResults//",
originName,
"//",
originName,
".jpg",
sep = "")
dev.print(png,
path,
width = 1500,
height = 1500)
# Getting data from other sources for probable species
ranges <-
sapply(possibleSpecies, function(name) {
key <-
name_backbone(name = name)[1]$usageKey
count <- occ_count(taxonKey = key)
occspocc <-
occ(
query = name,
from = c("vertnet", "bison", "ecoengine"),
limit = 10000
)
occspoccDF <- occ2df(occspocc)
lat <-
range(as.numeric(occspoccDF$latitude) , na.rm = TRUE)
long <-
range(as.numeric(occspoccDF$longitude) , na.rm = TRUE)
t <-
data.frame(
latOriginHigh = originLat[1],
latOriginLow = originLat[2],
longOriginHigh = originLong[1],
longOriginLow = originLong[2],
lathigh = lat[1],
latLow = lat[2],
longhigh = long[1],
longLow = long[2],
otherSources = dim(occspoccDF)[1],
GBIFRecords = count,
time = Sys.time() - s
)
if (dim(occspoccDF)[1] > 15) {
suppressMessages(newmap <-
get_map(
location = c(lon = latLonCenter$lon, lat = latLonCenter$lat),
zoom = zoom
))
plot <- ggmap(newmap)
occspoccDF$longitude <-
as.numeric(occspoccDF$longitude)
occspoccDF$latitude <-
as.numeric(occspoccDF$latitude)
plot <-
plot + geom_point(
data = occspoccDF,
aes(longitude, latitude),
color = "red",
size = 4
)
# suppressMessages(print(plot))
path <-
paste("TaxonResults//",
originName,
"//",
name,
".jpg",
sep = "")
dev.print(png,
path,
width = 1500,
height = 1500)
}
t
})
# End of Getting data for probable species
ranges <-
as.data.frame(t(ranges)) # converting from matrix to data frame
ranges <- as.data.frame(lapply(ranges, unlist))
ranges <-
ranges[order(ranges[, 9], decreasing = T), ] # sorting by otherSources data count
return(list(ranges))
})
} else {
answer <-
sapply(names(namesToResolve)[1:resolve], simplify = FALSE, USE.NAMES = TRUE, function(nameToResolve) {
s <- Sys.time()
# Stripping only the first two parts of scientific name. This is because,
# gnr_resolve works well only when giving first 2 parts
originName <- nameToResolve
nameFirstPart <-
unlist(strsplit(originName, split = ' ', fixed = TRUE))[1]
similarNamesOfOrigin <- tryCatch(
# this error catch is because name_lookup throws error and halts
# program when 'Error in `$<-.data.frame`(`*tmp*`, "taxonrank", value = character(0)) :
# replacement has 0 rows, data has 1 '
suppressMessages(sci2comm(nameFirstPart)),
error = function(e)
e
)  # TODO: filter using parameters
similarNamesOfOrigin <- similarNamesOfOrigin[[1]]
#print(class(similarNamesOfOrigin))
remarks <-
otherRankData[otherRankData$scientificName == originName, ]
remarks <- remarks$occurrenceRemarks
remarks <- remarks[remarks != ""]
remarks <-
remarks[remarks != "occurrenceRemarks withheld"]
remarks <- names(summary(as.factor(remarks)))
#print(class(remarks))
nameToResolve <-
sub("^(\\S*\\s+\\S+).*", "\\1", (nameToResolve)) # TODO fix tailing ',' problem
k <- Sys.time()
# Finding probable species names
gnrResults <- gnr_resolve(nameToResolve)
gnrResults <-
gnrResults[gnrResults$score >= gnr_score, ]
candidateList <- unique(gnrResults$matched_name)
possibleSpecies <-
lapply(candidateList, function(candidate) {
namelookup <-
tryCatch(
# this error catch is because name_lookup throws error and halts
# program when unknown query is provided
name_lookup(query = candidate),
error = function(e)
e
)  # TODO: filter using parameters
if (upto == "family") {
log <-
namelookup$data$rank == "SPECIES" |
namelookup$data$rank == "SUBSPECIES" |
namelookup$data$rank == "VARIETY" |
namelookup$data$rank == "GENUS" |
namelookup$data$rank == "FAMILY"
} else if (upto == "genus") {
log <-
namelookup$data$rank == "SPECIES" |
namelookup$data$rank == "SUBSPECIES" |
namelookup$data$rank == "VARIETY" |
namelookup$data$rank == "GENUS"
} else if (upto == "species") {
log <-
namelookup$data$rank == "SPECIES" |
namelookup$data$rank == "SUBSPECIES" |
namelookup$data$rank == "VARIETY"
} else if (upto == "subspecies") {
log <- namelookup$data$rank == "SUBSPECIES"
}
dat <- namelookup$data[log , ]
dat <- sub("^(\\S*\\s+\\S+).*",
"\\1",
(dat$scientificName))
unique(dat)
})
possibleSpecies <-
unique(as.vector(unlist(possibleSpecies)))
possibleSpecies <-
possibleSpecies[!is.na(possibleSpecies)] # sometimes name_lookup gives NA as a possible name
# End of Finding probable species names
# Getting data from other sources for probable species
ranges <-
sapply(possibleSpecies, function(name) {
#print(name)
key <-
name_backbone(name = name)[1]$usageKey
count <- occ_count(taxonKey = key)
similarNames <- tryCatch(
# this error catch is because name_lookup throws error and halts
# program when 'Error in `$<-.data.frame`(`*tmp*`, "taxonrank", value = character(0)) :
# replacement has 0 rows, data has 1 '
suppressMessages(sci2comm(name)),
error = function(e)
e
)  # TODO: filter using parameters
similarNames <- similarNames[[1]]
list(similarNames)
})
k <- list(
RemarksOnOriginalData = remarks,
CommonNamesOfOriginalData = similarNamesOfOrigin,
CommonNamesOfSpecies = ranges
)
return(k)
})
}
# Building the Answer
if (tidy_output) {
size <- length(namesToResolve)
if (size > 5) {
misspellingsData <- misspellingsData[1:5]
namesPercentageTable <- namesPercentageTable[1:5, ]
} else {
misspellingsData = misspellingsData[1:size]
namesPercentageTable <-
namesPercentageTable[1:size, ]
}
}
output <-
list(
percentageTable = percentageTable,
namesPercentageTable = namesPercentageTable,
possibleMisspellings = misspellingsData,
resolveResults = answer
)
message(Sys.time() - t)
return(output)
# End of Building the Answer
}
# Console history: resolve_taxonrank() is run twice on `dat` (an object created
# outside this excerpt), then once on its `data` element with the result kept
# in `flagged_dat`.
resolve_taxonrank(dat)
resolve_taxonrank(dat)
flagged_dat <- resolve_taxonrank(dat$data)
# repeating_digits() is defined outside this excerpt; its result is not stored here.
repeating_digits(dat)
# Validate an occurrence data set and return it as a plain table.
#
# Arguments:
#   GBIF_Data       A data frame / tibble (or any subclass of one), an object of
#                   class "dwca_gbif" (table taken from `$data$occurrence.txt`),
#                   or an object of class "gbif_data" (table taken from `$data`).
#   variable_vector Optional character vector of column names that must exist
#                   in the extracted table. NULL skips the column check.
#
# Returns the extracted table unchanged.
# Stops with "Incorrect input type, ..." for unsupported inputs and with
# "Missing column <name>" for the first required column that is absent.
format_checking <- function(GBIF_Data, variable_vector = NULL) {
  # inherits() looks at the whole class vector, so data-frame subclasses are
  # accepted instead of being rejected because of their first class entry.
  if (inherits(GBIF_Data, "dwca_gbif")) {
    GBIF_Data <- GBIF_Data$data$occurrence.txt
  } else if (inherits(GBIF_Data, "gbif_data")) {
    # "gbif_data" objects hold the table directly in `$data`; they have no
    # `occurrence.txt` element.
    GBIF_Data <- GBIF_Data$data
  } else if (!inherits(GBIF_Data, c("data.frame", "tbl_df"))) {
    stop(paste(
      "Incorrect input type, input a dataframe. Current type",
      class(GBIF_Data)[1],
      sep = " "
    ))
  }

  if (!is.null(variable_vector)) {
    # setdiff() keeps the order of `variable_vector`, so the reported column is
    # the first missing one.
    missing_columns <- setdiff(variable_vector, colnames(GBIF_Data))
    if (length(missing_columns) > 0) {
      stop(paste("Missing column", missing_columns[1], sep = " "))
    }
  }

  GBIF_Data
}
# Validate an occurrence data set and return it as a plain table.
#
# Arguments:
#   GBIF_Data       A data frame / tibble (or any subclass of one), an object of
#                   class "dwca_gbif" (table taken from `$data$occurrence.txt`),
#                   or an object of class "gbif_data" (table taken from `$data`).
#   variable_vector Optional character vector of column names that must exist
#                   in the extracted table. NULL skips the column check.
#
# Returns the extracted table unchanged.
# Stops with "Incorrect input type, ..." for unsupported inputs and with
# "Missing column <name>" for the first required column that is absent.
format_checking <- function(GBIF_Data, variable_vector = NULL) {
  # One if / else-if chain: an extra closing brace between the branches would
  # end the chain early and leave a dangling `else`, which does not parse.
  if (inherits(GBIF_Data, "dwca_gbif")) {
    GBIF_Data <- GBIF_Data$data$occurrence.txt
  } else if (inherits(GBIF_Data, "gbif_data")) {
    GBIF_Data <- GBIF_Data$data
  } else if (!inherits(GBIF_Data, c("data.frame", "tbl_df"))) {
    stop(paste(
      "Incorrect input type, input a dataframe. Current type",
      class(GBIF_Data)[1],
      sep = " "
    ))
  }

  if (!is.null(variable_vector)) {
    # setdiff() keeps the order of `variable_vector`, so the reported column is
    # the first missing one.
    missing_columns <- setdiff(variable_vector, colnames(GBIF_Data))
    if (length(missing_columns) > 0) {
      stop(paste("Missing column", missing_columns[1], sep = " "))
    }
  }

  GBIF_Data
}
# Validate an occurrence data set and return it as a plain table.
#
# Arguments:
#   GBIF_Data       A data frame / tibble (or any subclass of one), an object of
#                   class "dwca_gbif" (table taken from `$data$occurrence.txt`),
#                   or an object of class "gbif_data" (table taken from `$data`).
#   variable_vector Optional character vector of column names that must exist
#                   in the extracted table. NULL skips the column check.
#
# Returns the extracted table unchanged and prints nothing.
# Stops with "Incorrect input type, ..." for unsupported inputs and with
# "Missing column <name>" for the first required column that is absent.
format_checking <- function(GBIF_Data, variable_vector = NULL) {
  # inherits() looks at the whole class vector, so data-frame subclasses are
  # accepted instead of being rejected because of their first class entry.
  if (inherits(GBIF_Data, "dwca_gbif")) {
    GBIF_Data <- GBIF_Data$data$occurrence.txt
  } else if (inherits(GBIF_Data, "gbif_data")) {
    GBIF_Data <- GBIF_Data$data
  } else if (!inherits(GBIF_Data, c("data.frame", "tbl_df"))) {
    stop(paste(
      "Incorrect input type, input a dataframe. Current type",
      class(GBIF_Data)[1],
      sep = " "
    ))
  }

  if (!is.null(variable_vector)) {
    # setdiff() keeps the order of `variable_vector`, so the reported column is
    # the first missing one.
    missing_columns <- setdiff(variable_vector, colnames(GBIF_Data))
    if (length(missing_columns) > 0) {
      stop(paste("Missing column", missing_columns[1], sep = " "))
    }
  }

  GBIF_Data
}
# Console history: repeating_digits() (defined outside this excerpt) is run on
# `dat`, first discarding and then keeping the result.
repeating_digits(dat)
flagged_dat <- repeating_digits(dat)
# Inspect the `latRepeatCount` element of the result.
# NOTE(review): presumably a column added by repeating_digits() - confirm.
flagged_dat$latRepeatCount
# Re-run on the `data` element of `dat` instead of the whole object.
flagged_dat <- repeating_digits(dat$data)
# Summarise the records of an occurrence data set whose taxonRank is not
# SPECIES, SUBSPECIES or VARIETY ("unresolved" records).
#
# Arguments:
#   GBIF_Data  Input accepted by format_checking(); the resulting table must
#              contain the columns taxonRank, scientificName, decimalLatitude
#              and decimalLongitude.
#   ...        Accepted for interface compatibility; not used.
#
# Returns a list with:
#   estimation            Text estimate of the resolving time (7 minutes per name).
#   percentageTable       Counts of unresolved records in total and for the
#                         GENUS / FAMILY / ORDER / CLASS ranks, each as a
#                         percentage of all records and of the unresolved ones.
#   namesPercentageTable  One row per unresolved scientific name, most frequent
#                         first: record count, percentage of the unresolved
#                         records, and the taxonRank of its first record.
#
# Stops with "No names to resolve" when no record is unresolved. Emits the
# elapsed time in seconds through message().
resolve_taxon_inspect <- function(GBIF_Data,
                                  ...) {
  start_time <- Sys.time()
  GBIF_Data <- format_checking(
    GBIF_Data,
    c(
      "taxonRank",
      "scientificName",
      "decimalLatitude",
      "decimalLongitude"
    )
  )

  # --------- Subsetting unresolved Data -----------#
  # %in% never returns NA, so records with a missing taxonRank are kept as real
  # rows. Filtering with `!=` would turn each of them into a row of NAs.
  resolved_ranks <- c("SPECIES", "SUBSPECIES", "VARIETY")
  otherRankData <-
    GBIF_Data[!(GBIF_Data$taxonRank %in% resolved_ranks), ]
  # --------- End of Subsetting unresolved Data -----------#
  if (nrow(otherRankData) == 0) {
    stop("No names to resolve")
  }

  # --------- Percentage Table -----------#
  total <- NROW(GBIF_Data)
  unresolved <- NROW(otherRankData)
  row_labels <-
    c(
      "Total unresolved data",
      "Unresolved genus data",
      "Unresolved family data",
      "Unresolved order data",
      "Unresolved class data "
    )
  # sum(..., na.rm = TRUE) counts only true matches; subsetting rows with `==`
  # would also count records whose taxonRank is NA.
  rank_counts <- vapply(
    c("GENUS", "FAMILY", "ORDER", "CLASS"),
    function(rank) {
      sum(otherRankData$taxonRank == rank, na.rm = TRUE)
    },
    integer(1),
    USE.NAMES = FALSE
  )
  percentageTable <-
    data.frame(count = c(unresolved, rank_counts), row.names = row_labels)
  percentageTable$percentageOfTotalData <-
    round(percentageTable$count / total * 100, 2)
  percentageTable$percentageOfUnresolvedData <-
    round(percentageTable$count / unresolved * 100, 2)
  # --------- End of Percentage Table -----------#

  # --------- Names Table -----------#
  # Sorted by frequency so the names covering most of the data come first.
  namesToResolve <-
    sort(table(otherRankData$scientificName), decreasing = TRUE)
  name_counts <- as.vector(namesToResolve)
  # match() returns the first record of each name in a single pass and is not
  # misled by records whose scientificName is NA.
  first_record <-
    match(names(namesToResolve), otherRankData$scientificName)
  namesPercentageTable <-
    data.frame(
      count = name_counts,
      percentOfUnresolvedData = round(name_counts / unresolved * 100, 2),
      currentTaxonRank = otherRankData$taxonRank[first_record],
      row.names = names(namesToResolve)
    )
  # --------- End of Names Table -----------#

  # --------- Building the Answer -----------#
  name_total <- nrow(namesPercentageTable)
  string <-
    paste(
      "Your data has",
      name_total,
      "names to resolve which will take roughly",
      round((name_total * 7) / 60, 2),
      "hours to finish on an average of 7 minutes per name. But it\nmight vary depending on your internet connection speed and various other factors"
    )
  output <-
    list(
      estimation = string,
      percentageTable = percentageTable,
      namesPercentageTable = namesPercentageTable
    )
  # Force the unit to seconds: a bare difftime picks its own unit (mins, hours)
  # for longer runs, which would contradict the "seconds" label.
  elapsed <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
  message(paste0("Time difference of ", elapsed, " seconds"))
  output
  # --------- End of Building the Answer -----------#
}
# Console history: run the inspection on the `data` element of `dat` (an object
# created outside this excerpt), then look at its taxonRank column directly.
resolve_taxon_inspect(dat$data)
dat$data$taxonRank
# Validate an occurrence data set and return it as a plain table.
#
# Arguments:
#   GBIF_Data       A data frame / tibble (or any subclass of one), an object of
#                   class "dwca_gbif" (table taken from `$data$occurrence.txt`),
#                   or an object of class "gbif_data" (table taken from `$data`).
#   variable_vector Optional character vector of column names that must exist
#                   in the extracted table. NULL skips the column check.
#
# Returns the extracted table unchanged.
# Stops with "Incorrect input type, ..." for unsupported inputs and with
# "Missing column <name>" for the first required column that is absent.
format_checking <- function(GBIF_Data, variable_vector = NULL) {
  # inherits() looks at the whole class vector, so data-frame subclasses are
  # accepted instead of being rejected because of their first class entry.
  if (inherits(GBIF_Data, "dwca_gbif")) {
    GBIF_Data <- GBIF_Data$data$occurrence.txt
  } else if (inherits(GBIF_Data, "gbif_data")) {
    GBIF_Data <- GBIF_Data$data
  } else if (!inherits(GBIF_Data, c("data.frame", "tbl_df"))) {
    stop(paste(
      "Incorrect input type, input a dataframe. Current type",
      class(GBIF_Data)[1],
      sep = " "
    ))
  }

  if (!is.null(variable_vector)) {
    # setdiff() keeps the order of `variable_vector`, so the reported column is
    # the first missing one.
    missing_columns <- setdiff(variable_vector, colnames(GBIF_Data))
    if (length(missing_columns) > 0) {
      stop(paste("Missing column", missing_columns[1], sep = " "))
    }
  }

  GBIF_Data
}
# Console history: re-run the inspection after format_checking() was redefined above.
resolve_taxon_inspect(dat$data)