--- title: "ksformat Usage Examples" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{ksformat Usage Examples} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r setup, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) library(ksformat) ```

The **ksformat** package provides SAS PROC FORMAT-like functionality for R. This vignette walks through the most common use cases. ## Example 1: Basic Discrete Formatting Create a format for gender codes (auto-stored in library as "sex"): ```{r discrete} fnew( "M" = "Male", "F" = "Female", .missing = "Unknown", .other = "Other Gender", name = "sex" ) gender_codes <- c("M", "F", "M", NA, "X", "F") formatted_genders <- fput(gender_codes, "sex") data.frame( code = gender_codes, label = formatted_genders ) fprint("sex") ``` ## Example 2: Numeric Range Formatting Define formats in SAS-like text (auto-registered): ```{r ranges} fparse(text = ' VALUE age (numeric) [0, 18) = "Child" [18, 65) = "Adult" [65, HIGH] = "Senior" .missing = "Age Unknown" ; ') ages <- c(5, 15.3, 17.9, 18, 45, 64.99, 65, 85, NA) age_groups <- fputn(ages, "age") data.frame( age = ages, group = age_groups ) ``` ## Example 3: Decimal Ranges (BMI Categories) ```{r bmi} fparse(text = ' VALUE bmi (numeric) [0, 18.5) = "Underweight" [18.5, 25) = "Normal" [25, 30) = "Overweight" [30, HIGH] = "Obese" .missing = "No data" ; ') bmi_values <- c(16.2, 18.5, 22.7, 25, 29.9, 35.1, NA) bmi_labels <- fputn(bmi_values, "bmi") data.frame( bmi = bmi_values, category = bmi_labels ) ``` ## Example 4: Exclusive/Inclusive Bounds ```{r bounds} fparse(text = ' VALUE score (numeric) (0, 50] = "Low" (50, 100] = "High" .other = "Out of range" ; ') scores <- c(0, 1, 50, 51, 100, 101) score_labels <- fputn(scores, "score") data.frame( score = scores, label = score_labels ) ``` ## Example 5: Reverse Formatting with Invalue Invalues convert labels back to values. The default `target_type` is `"numeric"`: ```{r invalue} finput( "Male" = 1, "Female" = 2, name = "sex_inv" ) labels <- c("Male", "Female", "Male", "Unknown", "Female") codes <- finputn(labels, "sex_inv") data.frame( label = labels, code = codes ) ``` ## Example 6: Bidirectional Formatting `fnew_bid()` creates both a format and an invalue at once: ```{r bidirectional} status_bi <- fnew_bid( "A" = "Active", "I" = "Inactive", "P" = "Pending", name = "status" ) # Forward: code -> label status_codes <- c("A", "I", "P", "A") status_labels <- fputc(status_codes, "status") data.frame(code = status_codes, label = status_labels) # Reverse: label -> code test_labels <- c("Active", "Pending", "Inactive") test_codes <- finputc(test_labels, "status_inv") data.frame(label = test_labels, code = test_codes) ``` ## Example 7: Parse Multiple Formats from Text ```{r multiparse} fparse(text = ' // Study format definitions VALUE race (character) "W" = "White" "B" = "Black" "A" = "Asian" .missing = "Unknown" ; INVALUE race_inv "White" = 1 "Black" = 2 "Asian" = 3 ; ') fprint() ``` ## Example 8: Export Formats Back to Text ```{r export} bmi_fmt <- format_get("bmi") cat(fexport(bmi = bmi_fmt)) ``` ## Example 9: SAS-like PUT/INPUT Functions ```{r sas-put-input} # fputn — apply numeric format by name fputn(c(5, 30, 70), "age") # fputc — apply character format by name fputc(c("M", "F"), "sex") # finputn — apply numeric invalue by name finputn(c("White", "Black"), "race_inv") ``` ## Example 10: Data Frame Formatting ```{r df-format} df <- data.frame( id = 1:6, sex = c("M", "F", "M", "F", NA, "X"), age = c(15, 25, 45, 70, 35, NA), stringsAsFactors = FALSE ) sex_f <- format_get("sex") age_f <- format_get("age") df_formatted <- fput_df( df, sex = sex_f, age = age_f, suffix = "_label" ) df_formatted ``` ## Example 11: Missing Value Handling ```{r missing} # With .missing label fput(c("M", "F", NA), "sex") # With keep_na = TRUE fput(c("M", "F", NA), sex_f, keep_na = TRUE) # is_missing() checks is_missing(NA) is_missing(NaN) is_missing("") # TRUE — empty strings are treated as missing ``` ## Example 12: Date/Time Formats (SAS-style) ### SAS Date Formats SAS date format names are auto-resolved — no pre-creation needed: ```{r date-formats} today <- Sys.Date() data.frame( format = c("DATE9.", "MMDDYY10.", "DDMMYY10.", "YYMMDD10.", "MONYY7.", "WORDDATE.", "YEAR4.", "QTR."), result = c( fputn(today, "DATE9."), fputn(today, "MMDDYY10."), fputn(today, "DDMMYY10."), fputn(today, "YYMMDD10."), fputn(today, "MONYY7."), fputn(today, "WORDDATE."), fputn(today, "YEAR4."), fputn(today, "QTR.") ) ) # Multiple dates dates <- as.Date(c("2020-01-15", "2020-06-30", "2020-12-25")) fputn(dates, "DATE9.") ``` ### R Numeric Dates (Days Since 1970-01-01) ```{r date-numeric} r_days <- as.numeric(as.Date("2025-01-01")) r_days fputn(r_days, "DATE9.") fputn(r_days, "MMDDYY10.") ``` ### Time Formats Time is represented as seconds since midnight: ```{r time-formats} seconds <- c(0, 3600, 45000, 86399) data.frame( seconds = seconds, TIME8 = fputn(seconds, "TIME8."), TIME5 = fputn(seconds, "TIME5."), HHMM = fputn(seconds, "HHMM.") ) ``` ### Datetime Formats ```{r datetime-formats} now <- Sys.time() data.frame( format = c("DATETIME20.", "DATETIME13.", "DTDATE.", "DTYYMMDD."), result = c( fputn(now, "DATETIME20."), fputn(now, "DATETIME13."), fputn(now, "DTDATE."), fputn(now, "DTYYMMDD.") ) ) # From numeric R-epoch seconds r_secs <- as.numeric(as.POSIXct("2025-06-15 14:30:00", tz = "UTC")) fputn(r_secs, "DATETIME20.") ``` ### Custom Date Formats with `fnew_date()` ```{r fnew-date} # SAS-named format fnew_date("DATE9.", name = "bday_fmt") birthdays <- as.Date(c("1990-03-25", "1985-11-03", "2000-07-14")) fput(birthdays, "bday_fmt") # Custom strftime pattern (e.g. DD.MM.YYYY) fnew_date("%d.%m.%Y", name = "ru_date", type = "date") fput(birthdays, "ru_date") # Custom pattern with missing label fnew_date("MMDDYY10.", name = "us_date", .missing = "NO DATE") mixed <- c(as.Date("2025-01-01"), NA, as.Date("2025-12-31")) fput(mixed, "us_date") fprint("bday_fmt") ``` ### Date Formats in Data Frames ```{r date-df} patients <- data.frame( id = 1:4, visit_date = as.Date(c("2025-01-10", "2025-02-15", "2025-03-20", NA)), stringsAsFactors = FALSE ) visit_fmt <- fnew_date("DATE9.", name = "visit_fmt", .missing = "NOT RECORDED") fput_df(patients, visit_date = visit_fmt) ``` ### Parse Date Formats from Text ```{r date-parse} fparse(text = ' VALUE enrldt (date) pattern = "DATE9." .missing = "Not Enrolled" ; VALUE visit_time (time) pattern = "TIME8." ; VALUE stamp (datetime) pattern = "DATETIME20." ; ') fput(as.Date("2025-03-01"), "enrldt") fput(36000, "visit_time") fput(as.POSIXct("2025-03-01 10:00:00", tz = "UTC"), "stamp") # Export back to text enrl_obj <- format_get("enrldt") cat(fexport(enrldt = enrl_obj)) fclear() ``` ## Example 13: Multilabel Formats ### Overlapping Age Categories With multilabel formats, a single value can match multiple labels: ```{r multilabel-basic} fnew( "0,5,TRUE,TRUE" = "Infant", "6,11,TRUE,TRUE" = "Child", "12,17,TRUE,TRUE" = "Adolescent", "0,17,TRUE,TRUE" = "Pediatric", "18,64,TRUE,TRUE" = "Adult", "65,Inf,TRUE,TRUE" = "Elderly", "18,Inf,TRUE,TRUE" = "Non-Pediatric", name = "age_categories", type = "numeric", multilabel = TRUE ) ages <- c(3, 14, 25, 70) # fput returns first match only fput(ages, "age_categories") # fput_all returns ALL matching labels all_labels <- fput_all(ages, "age_categories") for (i in seq_along(ages)) { cat("Age", ages[i], "->", paste(all_labels[[i]], collapse = ", "), "\n") } ``` ### Multilabel with Missing Values ```{r multilabel-missing} fnew( "0,100,TRUE,TRUE" = "Valid Score", "0,49,TRUE,TRUE" = "Below Average", "50,100,TRUE,TRUE" = "Above Average", "90,100,TRUE,TRUE" = "Excellent", .missing = "No Score", .other = "Out of Range", name = "score_ml", type = "numeric", multilabel = TRUE ) scores <- c(95, 45, NA, 150) ml_result <- fput_all(scores, "score_ml") for (i in seq_along(scores)) { cat("Score", ifelse(is.na(scores[i]), "NA", scores[i]), "->", paste(ml_result[[i]], collapse = ", "), "\n") } ``` ### Parse Multilabel from Text ```{r multilabel-parse} fparse(text = ' VALUE risk (numeric, multilabel) [0, 3] = "Low Risk" [0, 7] = "Monitored" (3, 7] = "Medium Risk" (7, 10] = "High Risk" ; ') risk_scores <- c(2, 5, 9) risk_labels <- fput_all(risk_scores, "risk") for (i in seq_along(risk_scores)) { cat("Score", risk_scores[i], "->", paste(risk_labels[[i]], collapse = " | "), "\n") } ``` ### Multilabel Export ```{r multilabel-export} risk_obj <- format_get("risk") cat(fexport(risk = risk_obj)) fprint("risk") ``` ### Practical Example: Adverse Event Severity Grading ```{r ae-grading} fnew( "1,1,TRUE,TRUE" = "Mild", "2,2,TRUE,TRUE" = "Moderate", "3,3,TRUE,TRUE" = "Severe", "4,4,TRUE,TRUE" = "Life-threatening", "5,5,TRUE,TRUE" = "Fatal", "3,5,TRUE,TRUE" = "Serious", "1,2,TRUE,TRUE" = "Non-serious", name = "ae_grade", type = "numeric", multilabel = TRUE ) grades <- c(1, 2, 3, 4, 5) ae_labels <- fput_all(grades, "ae_grade") for (i in seq_along(grades)) { cat("Grade", grades[i], ":", paste(ae_labels[[i]], collapse = " + "), "\n") } fclear() ``` ## Example 14: Case-Insensitive Matching ```{r nocase} sex_nc <- fnew( "M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex_nc", type = "character", ignore_case = TRUE ) input <- c("m", "F", "M", "f", NA) fput(input, sex_nc) # Note the [nocase] flag fprint("sex_nc") # Also works with fputc fputc("m", "sex_nc") fclear() ``` ## Example 15: Expression Labels in Formats Expression labels contain `.x1`, `.x2`, etc., which reference extra arguments passed to `fput()`. This lets you compute labels dynamically. ### Simple `sprintf` Expression ```{r expr-sprintf} stat_fmt <- fnew( "n" = "sprintf('%s', .x1)", "pct" = "sprintf('%.1f%%', .x1 * 100)", name = "stat", type = "character" ) types <- c("n", "pct", "n", "pct") values <- c(42, 0.053, 100, 0.255) fput(types, stat_fmt, values) ``` ### Two Extra Arguments (`.x1`, `.x2`) ```{r expr-twoargs} ratio_fmt <- fnew( "ratio" = "sprintf('%s/%s', .x1, .x2)", name = "ratio", type = "character" ) fput("ratio", ratio_fmt, 3, 10) fput(c("ratio", "ratio"), ratio_fmt, c(3, 7), c(10, 20)) ``` ### `ifelse` Expression ```{r expr-ifelse} sign_fmt <- fnew( "val" = "ifelse(.x1 > 0, paste0('+', .x1), as.character(.x1))", name = "sign", type = "character" ) nums <- c(5, 0, -3) fput(rep("val", 3), sign_fmt, nums) ``` ### Mixed Static and Expression Labels ```{r expr-mixed} mixed_fmt <- fnew( "header" = "HEADER", "n" = "sprintf('N=%s', .x1)", "pct" = "sprintf('%.1f%%', .x1 * 100)", name = "mixed", type = "character" ) keys <- c("header", "n", "pct", "header", "n") vals <- c(0, 42, 0.15, 0, 100) fput(keys, mixed_fmt, vals) ``` ### Expression in `.other` Fallback ```{r expr-other} known_fmt <- fnew( "ok" = "OK", .other = "sprintf('Error(%s)', .x1)", name = "err_fmt", type = "character" ) codes <- c("ok", "E01", "ok", "E99") details <- c("", "timeout", "", "overflow") fput(codes, known_fmt, details) ``` ### Scalar Recycling ```{r expr-recycle} label_fmt <- fnew( "val" = "sprintf('%s (N=%s)', .x1, .x2)", name = "recycle", type = "character" ) fput(c("val", "val"), label_fmt, c(42, 55), 100) ``` ### Statistical Table Format with Computed Labels A realistic clinical-trial example: `e()` marks labels as expressions evaluated at apply-time, `.x1` references the extra argument, and multiline `dplyr::case_when` shows complex conditional formatting. ```{r expr-stat-fnew} # Population counts used as denominators n.trt <- data.frame(pop = c("fas","pps","saf"), ntot = c(34, 30, 36)) get_n <- function(pop) { n.trt$ntot[n.trt$pop == pop] } fnew( "n_fas" = e("get_n('fas')"), "n_pps" = e("get_n('pps')"), "n_saf" = e("get_n('saf')"), "n" = "sprintf('%d', .x1)", "n_pct_fas" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('fas'))", "n_pct_pps" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('pps'))", "n_pct_saf" = "sprintf('%d (%5.1f%%)', .x1, .x1 * 100 / get_n('saf'))", "pct" = "dplyr::case_when( .x1>0 & .x1<0.1 ~ sprintf('%5s', ' <0.1%'), .x1>=0.1 | .x1==0 ~ sprintf(paste0('%5.', 1 ,'f%%'), .x1) )", "pval" = "dplyr::case_when( .x1>=0 & .x1<0.001 ~ sprintf('%s', '<0.001'), .x1>=0.001 & .x1<=0.999 ~ sprintf(paste0('%.', 3 ,'f'), .x1), .x1>0.999 ~ sprintf('%s', '>0.999'), .default = '--' )", name = "stat", type = "character" ) ``` The same format can be created via `fparse()`. Note that multiline expressions must be collapsed to single lines in the text block, and `(eval)` marks evaluated labels: ```{r expr-stat-fparse} fmt <- ' VALUE stat_01 (character) "n_fas" = "get_n(\'fas\')" (eval) "n_pps" = "get_n(\'pps\')" (eval) "n_saf" = "get_n(\'saf\')" (eval) "n" = "sprintf(\'%d\', .x1)" "pct" = "dplyr::case_when(.x1>0 & .x1<0.1 ~ sprintf(\'%5s\', \' <0.1%\'), .x1>=0.1 | .x1==0 ~ sprintf(paste0(\'%5.\', 1 ,\'f%%\'), .x1))" "n_pct_fas" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'fas\'))" "n_pct_pps" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'pps\'))" "n_pct_saf" = "sprintf(\'%d (%5.1f%%)\', .x1, .x1 * 100 / get_n(\'saf\'))" "pval" = "dplyr::case_when(.x1>=0 & .x1<0.001 ~ sprintf(\'%s\', \'<0.001\'), .x1>=0.001 & .x1<=0.999 ~ sprintf(paste0(\'%.\', 3 ,\'f\'), .x1), .x1>0.999 ~ sprintf(\'%s\', \'>0.999\'), .default = \'--\')" ;' fparse(fmt) ``` Both `stat` (via `fnew`) and `stat_01` (via `fparse`) produce identical results: ```{r expr-stat-apply} df <- data.frame( types = c("n_fas", "n_pps", "n_saf", "n", "pct", "pct", "n", "pval", "pval", "n_pct_fas", "n_pct_pps", "n_pct_saf"), values = c(NA, NA, NA, 42, 0.053, 0.0008, 100, 0.255, 0.0003, 22, 22, 22) ) df$fmt <- fput(df$types, "stat", df$values) df$fmt_01 <- fput(df$types, "stat_01", df$values) print(df) fclear() ``` ## Example 16: Vectorized Format Names (SAS PUTC-style) Each element can use a different format, determined by a vector of format names: ```{r vectorized} # Dispatch format: maps type code to format name fnew("1" = "groupx", "2" = "groupy", "3" = "groupz", name = "typefmt", type = "numeric") # Per-group character formats fnew("positive" = "agree", "negative" = "disagree", "neutral" = "notsure", name = "groupx", type = "character") fnew("positive" = "accept", "negative" = "reject", "neutral" = "possible", name = "groupy", type = "character") fnew("positive" = "pass", "negative" = "fail", "neutral" = "retest", name = "groupz", type = "character") type <- c(1, 1, 1, 2, 2, 2, 3, 3, 3) response <- c("positive", "negative", "neutral", "positive", "negative", "neutral", "positive", "negative", "neutral") # Step 1: map type -> format name respfmt <- fput(type, "typefmt") # Step 2: apply per-element format word <- fputc(response, respfmt) data.frame(type = type, response = response, respfmt = respfmt, word = word) fclear() ``` ## Example 17: Working with Dates and Formats — PUTN A SAS-style workflow where format names are looked up dynamically per observation: ```{r dates-putn} # Format that maps key codes to date format names fnew("1" = "date9.", "2" = "mmddyy10.", name = "writfmt", type = "numeric") fnew_date("date9.") fnew_date("mmddyy10.") # Input data (R date numbers = days since 1970-01-01) number <- c(12103, 10899) key <- c(1, 2) # Look up format name per observation datefmt <- fputn(key, "writfmt") # Apply per-element date format date <- fputn(number, datefmt) data.frame(number = number, key = key, datefmt = datefmt, date = date) fclear() ``` ## Example 18: Import SAS Formats from CNTLOUT CSV The `fimport()` function reads a CSV file exported from a SAS format catalogue (`PROC FORMAT ... CNTLOUT=`): ```{r cntlout-import} csv_path <- system.file("extdata", "test_cntlout.csv", package = "ksformat") ``` ```{r cntlout-use} imported <- fimport(csv_path) names(imported) fprint() ``` ### Use Imported Formats ```{r cntlout-apply} # Character format (GENDER) gender_codes <- c("M", "F", NA, "X") data.frame( code = gender_codes, label = fputc(gender_codes, "GENDER") ) # Numeric format (AGEGRP) ages <- c(5, 17, 18, 45, 65, 100, NA, -1) data.frame( age = ages, group = fputn(ages, "AGEGRP") ) # Numeric format (BMICAT) bmi_values <- c(15.0, 18.5, 22.3, 25.0, 28.7, 30.0, 35.5) data.frame( bmi = bmi_values, category = fputn(bmi_values, "BMICAT") ) # Invalue (RACEIN) race_labels <- c("White", "Black", "Asian", "Other") data.frame( label = race_labels, code = finputn(race_labels, "RACEIN") ) ``` ### Apply to Data Frame ```{r cntlout-df} df <- data.frame( id = 1:5, sex = c("M", "F", "M", NA, "F"), age = c(10, 30, 70, NA, 50), stringsAsFactors = FALSE ) gender_fmt <- imported[["GENDER"]] age_fmt <- imported[["AGEGRP"]] fput_df(df, sex = gender_fmt, age = age_fmt, suffix = "_label") ``` ### Export Imported Format ```{r cntlout-export} cat(fexport(AGEGRP = age_fmt)) cat(fexport(GENDER = gender_fmt)) ``` ### Selective Import (No Auto-register) ```{r cntlout-manual} fclear() manual <- fimport(csv_path, register = FALSE) # Library should be empty fprint() # Use directly from returned list fput(c("M", "F"), manual[["GENDER"]]) fclear() ``` ## Example 19: Bilingual Format Expression labels can select between languages at apply-time using an extra argument: ```{r bilingual} # Single format, language selected via .x1 extra argument sex_bi <- fnew( "M" = "ifelse(.x1 == 'en', 'Male', 'Homme')", "F" = "ifelse(.x1 == 'en', 'Female', 'Femme')", .missing = "Unknown", name = "sex_bi" ) # .x1 = language code per observation fput(c("M", "F", "M"), sex_bi, c("en", "fr", "en")) # -> "Male" "Femme" "Male" # Alternative: one format per language, selected at apply-time fnew("M" = "Male", "F" = "Female", .missing = "Unknown", name = "sex_en") fnew("M" = "Homme", "F" = "Femme", .missing = "Inconnu", name = "sex_fr") lang <- "fr" fput(c("M", "F", NA), paste0("sex_", lang)) # -> "Homme" "Femme" "Inconnu" fclear() ```