Check datasets in package with utf-8, latin1 or bytes characters

Here is a hack of tools:::.check_package_datasets() to show what dataset has utf-8, latin1 or bytes characters.
pkgDir must be the path of the package.



  check_package_datasets <- function (pkgDir) 
  {
    oLC_ct <- Sys.getlocale("LC_CTYPE")
    on.exit(Sys.setlocale("LC_CTYPE", oLC_ct))
    Sys.setlocale("LC_CTYPE", "C")
    oop <- options(warn = -1)
    on.exit(options(oop), add = TRUE)
    check_one <- function(x, ds) {
      if (!length(x)) 
        return()
      if (is.list(x)) 
        lapply(unclass(x), check_one, ds = ds)
      if (is.character(x)) {
        xx <- unclass(x)
        enc <- Encoding(xx)
        latin1 <<- latin1 + sum(enc == "latin1")
        utf8 <<- utf8 + sum(enc == "UTF-8")
        bytes <<- bytes + sum(enc == "bytes")
        unk <- xx[enc == "unknown"]
        ind <- .Call(tools:::C_check_nonASCII2, unk)
        if (length(ind)) {
          non_ASCII <<- c(non_ASCII, unk[ind])
          where <<- c(where, rep.int(ds, length(ind)))
        }
      }
      a <- attributes(x)
      if (!is.null(a)) {
        lapply(a, check_one, ds = ds)
        check_one(names(a), ds)
      }
      invisible()
    }
    sink(tempfile())
    on.exit(sink(), add = TRUE)
    files <- tools:::list_files_with_type(file.path(pkgDir, "data"), 
                                          "data")
    files <- unique(basename(tools:::file_path_sans_ext(files)))
    ans <- vector("list", length(files))
    dataEnv <- new.env(hash = TRUE)
    names(ans) <- files
    old <- setwd(pkgDir)
    point_try <- function(expr, msg) {
      oop <- options(warn = 1)
      on.exit(options(oop))
      outConn <- file(open = "w+")
      sink(outConn, type = "output")
      sink(outConn, type = "message")
      tryCatch(withRestarts(withCallingHandlers(expr, error = {
        function(e) invokeRestart("grmbl", e, sys.calls())
      }), grmbl = function(e, calls) {
        n <- length(sys.calls())
        calls <- calls[-seq.int(length.out = n - 1L)]
        calls <- rev(calls)[-c(1L, 2L)]
        tb <- lapply(calls, deparse)
        message(msg, conditionMessage(e), "\nCall sequence:\n", 
                paste(c(utils::head(.eval_with_capture(traceback(tb))$output, 
                                    5), "  ..."), collapse = "\n"), "\n")
      }), error = identity, finally = {
        sink(type = "message")
        sink(type = "output")
        close(outConn)
      })
    }
    for (f in files) {
      msg <- sprintf("Error loading dataset %s: ", sQuote(f))
      point_try(utils::data(list = f, package = character(), envir = dataEnv), 
                msg)
    }
    setwd(old)
    non_ASCII <- where <- character()
    latin1 <- utf8 <- bytes <- 0L
    closeAllConnections()
    preutf8 <- 0
    prelatin1 <- 0
    prebytes <- 0
    for (ds in ls(envir = dataEnv, all.names = TRUE)) {
      
      if (inherits(suppressWarnings(suppressMessages(try(check_one(get(ds, 
                                                                       envir = dataEnv), ds), silent = TRUE))), "try-error")) {
        msg <- sprintf("Error loading dataset %s:\n ", sQuote(ds))
        message(msg, geterrmessage())
      }
      if (utf8-preutf8 != 0) print(paste0(ds, "  ", utf8-preutf8, " utf-8"))
      if (latin1-prelatin1 != 0) print(paste0(ds, "  ", latin1-prelatin1, " latin1"))
      if (bytes-prebytes != 0) print(paste0(ds, "  ", bytes-prebytes, " bytes"))
      preutf8 <- utf8
      prelatin1 <- latin1
      prebytes <- bytes
    }
    unknown <- unique(cbind(non_ASCII, where))
    structure(list(latin1 = latin1, utf8 = utf8, bytes = bytes, 
                   unknown = unknown), class = "check_package_datasets")
  }
  
  

Commentaires

Posts les plus consultés de ce blog

Standard error from Hessian Matrix... what can be done when problem occurs

Install treemix in ubuntu 20.04

stepAIC from package MASS with AICc