## ----include = FALSE----------------------------------------------------------
# Global knitr chunk defaults for this vignette script.
# `FALSE` is spelled out rather than `F`: `F` is an ordinary variable that can
# be reassigned, whereas `FALSE` is a reserved constant.
knitr::opts_chunk$set(
  collapse = TRUE,  # merge each chunk's source and output into a single block
  comment = "#>",   # prefix placed in front of printed output lines
  eval = FALSE,     # chunks are not run unless they opt in with eval = TRUE
  warning = FALSE   # keep warnings out of the rendered output
)

## -----------------------------------------------------------------------------
# # Load RuHere package
# library(RuHere)

## ----eval = TRUE--------------------------------------------------------------
# Bring the example occurrence dataset shipped with RuHere into the workspace
data(list = "occurrences", package = "RuHere")
# Tabulate how many records each species has
table(occurrences[["species"]])

## -----------------------------------------------------------------------------
# # Standardize country names
# occ_country_std <- standardize_countries(
#     occ = occurrences,
#     country_column = "country",
#     max_distance = 0.1,      # Maximum error distance for fuzzy matching
#     lookup_na_country = TRUE # Try to extract country from coords if value is
#     # NA using the country_from_coords() function internally
# )

## -----------------------------------------------------------------------------
# # Printing first rows and columns
# occ_country_std$occ[1:3, 1:5]
# #>   country country_suggested country_source  record_id               species
# #> 1      AR         argentina       metadata  gbif_5516  Araucaria angustifolia
# #> 2      AR         argentina       metadata gbif_15849  Araucaria angustifolia
# #> 3      AR         argentina       metadata  gbif_4935  Araucaria angustifolia
# 
# occ_country_std$report[1:5, ]
# #>      country country_suggested
# #> 1  argentina         argentina
# #> 2    bolivia           bolivia
# #> 3     brasil            brazil
# #> 4         UY           uruguay
# #> 5         PT          portugal

## -----------------------------------------------------------------------------
# # Standardize state names
# occ_state_std <- standardize_states(
#     occ = occ_country_std$occ,
#     state_column = "stateProvince",
#     country_column = "country_suggested",
#     max_distance = 0.1,
#     lookup_na_state = TRUE # Try to extract state from coords if value is NA
# )

## -----------------------------------------------------------------------------
# occ_state_std$occ[1:3, 1:6]
# #>   stateProvince state_suggested state_source country_suggested country country_source
# #> 1          acre            acre     metadata            brazil  brazil       metadata
# #> 2          acre            acre     metadata            brazil  brazil       metadata
# #> 3          acre            acre     metadata            brazil  brazil       metadata
# 
# occ_state_std$report[1:3, ]
# #>       stateProvince           state_suggested  country_suggested
# #> 1        sa£o paulo                 sao paulo             brazil
# #> 2         tocantins                 tocantins             brazil
# #> 3               RS          rio grande do sul             brazil

## -----------------------------------------------------------------------------
# # Explicitly extract country from coordinates for all records
# occ_with_country_xy <- country_from_coords(
#     occ = occ_state_std$occ,
#     from = "all", # 'all' extracts for every record; 'na_only' extracts for missing ones
#     output_column = "country_xy"
# )
# 
# # Compare the original country vs. the one derived from coordinates
# head(occ_with_country_xy[, c("country", "country_xy")])
# #>   country country_xy
# #> 1  brazil     brazil
# #> 2  brazil     brazil
# #> 3  brazil     brazil
# #> 4      BR     brazil
# #> 5      BR     brazil
# #> 6      BR     brazil

## -----------------------------------------------------------------------------
# # Extract state from coordinates for all records
# occ_imputed <- states_from_coords(
#     occ = occ_with_country_xy,
#     from = "all",
#     state_column = "stateProvince",
#     output_column = "state_xy"
# )
# 
# head(occ_imputed[, c("stateProvince", "state_xy", "state_source")])
# #>   stateProvince state_xy state_source
# #> 1          acre     acre     metadata
# #> 2          acre     acre     metadata
# #> 3          acre     acre     metadata
# #> 4          acre amazonas     metadata
# #> 5          acre     acre     metadata
# #> 6          acre     acre     metadata

## -----------------------------------------------------------------------------
# # Check if coordinates fall within the assigned country
# occ_checked_country <- check_countries(
#     occ = occ_imputed,
#     country_column = "country_suggested",
#     distance = 5,      # Allows a 5 km buffer for border points
#     try_to_fix = TRUE  # Automatically attempts to fix inverted/swapped coordinates
# )
# #> Testing countries...
# #> 468 records fall in wrong countries
# #> Task 1 of 7: testing if longitude is inverted
# #> 0 coordinates with longitude inverted
# #> Task 2 of 7: testing if latitude is inverted
# #> 0 coordinates with latitude inverted
# #> Task 3 of 7: testing if longitude and latitude are inverted
# #> 2 coordinates with longitude and latitude inverted
# #> Task 4 of 7: testing if longitude and latitude are swapped
# #> 1 coordinates with longitude and latitude swapped
# #> Task 5 of 7: testing if longitude and latitude are swapped with longitude inverted
# #> 0 coordinates with longitude and latitude swapped and latitude inverted
# #> Task 6 of 7: testing if longitude and latitude are swapped - with latitude inverted
# #> 0 coordinates with longitude and latitude swapped and longitude inverted
# #> Task 7 of 7: testing if longitude and latitude are swapped - with longitude latitude inverted
# #> 0 coordinates with longitude and latitude swapped and inverted
# 
# # The 'correct_country' column indicates validity
# head(occ_checked_country[, c("country_suggested", "correct_country", "country_issues")])
# #>   country_suggested correct_country country_issues
# #> 1            brazil            TRUE        correct
# #> 2            brazil            TRUE        correct
# #> 3            brazil            TRUE        correct
# #> 4            brazil            TRUE        correct
# #> 5            brazil            TRUE        correct
# #> 6            brazil            TRUE        correct

## -----------------------------------------------------------------------------
# # Check if coordinates fall within the assigned state
# occ_checked_state <- check_states(
#     occ = occ_checked_country,
#     state_column = "state_suggested",
#     distance = 5,
#     try_to_fix = FALSE # We just want to flag issues here, not auto-fix
# )
# #> Testing states...
# #> 87 records fall in wrong states
# 
# head(occ_checked_state[, c("state_suggested", "correct_state")])
# #>   state_suggested correct_state
# #> 1            acre          TRUE
# #> 2            acre          TRUE
# #> 3            acre          TRUE
# #> 4            acre         FALSE
# #> 5            acre          TRUE
# #> 6            acre          TRUE

## -----------------------------------------------------------------------------
# # This step is only necessary if you did NOT set try_to_fix = TRUE above
# fixing_example <- fix_countries(
#    occ = occ_checked_country,
#    country_column = "country_suggested",
#    correct_country = "correct_country" # Column created by check_countries
# )
# #> Task 1 of 7: testing if longitude is inverted
# #> 0 coordinates with longitude inverted
# #> Task 2 of 7: testing if latitude is inverted
# #> 0 coordinates with latitude inverted
# #> Task 3 of 7: testing if longitude and latitude are inverted
# #> 0 coordinates with longitude and latitude inverted
# #> Task 4 of 7: testing if longitude and latitude are swapped
# #> 0 coordinates with longitude and latitude swapped
# #> Task 5 of 7: testing if longitude and latitude are swapped with longitude inverted
# #> 0 coordinates with longitude and latitude swapped and latitude inverted
# #> Task 6 of 7: testing if longitude and latitude are swapped - with latitude inverted
# #> 0 coordinates with longitude and latitude swapped and longitude inverted
# #> Task 7 of 7: testing if longitude and latitude are swapped - with longitude latitude inverted
# #> 0 coordinates with longitude and latitude swapped and inverted

