lyscripts.data.lyproxify
Consumes raw data and transforms it into a CSV of the format that [LyProX] understands.
To do so, it needs a dictionary that defines a mapping from raw columns to the LyProX
style data format. See the documentation of the transform_to_lyprox
function for
more information.
"""
Consumes raw data and transforms it into a CSV of the format that [LyProX] understands.

To do so, it needs a dictionary that defines a mapping from raw columns to the LyProX
style data format. See the documentation of the `transform_to_lyprox` function for
more information.

[LyProX]: https://lyprox.org
"""
# pylint: disable=logging-fstring-interpolation
import argparse
import importlib.util
import logging
import warnings
from pathlib import Path
from typing import Any

import pandas as pd

from lyscripts.data.utils import save_table_to_csv
from lyscripts.decorators import log_state
from lyscripts.utils import delete_private_keys, flatten, load_patient_data

warnings.simplefilter(action="ignore", category=FutureWarning)


logger = logging.getLogger(__name__)


def _add_parser(
    subparsers: argparse._SubParsersAction,
    help_formatter,
):
    """
    Register this command's `ArgumentParser` with the `subparsers` action.

    The command is named after this file (without the `.py` suffix) and uses the
    module docstring both as its description and as its short help text.
    """
    parser = subparsers.add_parser(
        Path(__file__).name.replace(".py", ""),
        description=__doc__,
        help=__doc__,
        formatter_class=help_formatter,
    )
    _add_arguments(parser)


def _add_arguments(parser: argparse.ArgumentParser):
    """
    Add the command line arguments of this script to the `parser`.

    Also stores `main` as the parser's `run_main` default, so that the caller can
    dispatch to it once this command was chosen.
    """
    parser.add_argument(
        "-i", "--input", type=Path, required=True,
        help="Location of raw CSV data."
    )
    parser.add_argument(
        "-r", "--header-rows", nargs="+", default=[0], type=int,
        help="List with header row indices of raw file."
    )
    parser.add_argument(
        "-o", "--output", type=Path, required=True,
        help="Location to store the lyproxified CSV file."
    )
    parser.add_argument(
        "-m", "--mapping", type=Path, required=True,
        help=(
            "Location of the Python file that contains column mapping instructions. "
            "This must contain a dictionary with the name 'column_map'."
        )
    )
    parser.add_argument(
        "--drop-rows", nargs="+", type=int, default=[],
        help=(
            "Delete rows of specified indices. Counting of rows start at 0 _after_ "
            "the `header-rows`."
        )
    )
    parser.add_argument(
        "--drop-cols", nargs="+", type=int, default=[],
        help="Delete columns of specified indices.",
    )
    parser.add_argument(
        "--add-index", action="store_true",
        help="If the data doesn't contain an index, add one by enumerating the patients"
    )

    parser.set_defaults(run_main=main)


class ParsingError(Exception):
    """Error while parsing the CSV file."""


def clean_header(
    table: pd.DataFrame,
    num_cols: int,
    num_header_rows: int,
) -> pd.DataFrame:
    """
    Rename the placeholder header cells in the `table`.

    Every column label of the form `"Unnamed: {col}_level_{row}"`, with `col` in
    `range(num_cols)` and `row` in `range(num_header_rows)`, is replaced by
    `"{col}_lvl_{row}"`. All other labels are left untouched.

    The `table` is modified in place and returned for convenience.
    """
    placeholder_names = {
        f"Unnamed: {col}_level_{row}": f"{col}_lvl_{row}"
        for col in range(num_cols)
        for row in range(num_header_rows)
    }
    # One rename with the complete mapping instead of one rename per (col, row)
    # pair: every rename rebuilds the whole column index.
    table.rename(columns=placeholder_names, inplace=True)
    return table


def get_instruction_depth(nested_column_map: dict[tuple, dict[str, Any]]) -> int:
    """
    Get the depth at which the column mapping instructions are nested.

    Instructions are a dictionary that contains either a 'func' or 'default' key.
    Only the first dictionary-valued entry of every level is followed.

    Example:
    >>> nested_column_map = {"patient": {"age": {"func": int}}}
    >>> get_instruction_depth(nested_column_map)
    2
    >>> flat_column_map = flatten(nested_column_map, max_depth=2)
    >>> get_instruction_depth(flat_column_map)
    1
    >>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
    >>> get_instruction_depth(nested_column_map)
    Traceback (most recent call last):
        ...
    ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.
    """
    depth = 1
    level = nested_column_map
    while True:
        first_dict = next(
            (entry for entry in level.values() if isinstance(entry, dict)),
            None,
        )
        if first_dict is None:
            raise ValueError(
                "Leaf of column map must be a dictionary with 'func' or 'default' key."
            )
        if "func" in first_dict or "default" in first_dict:
            return depth

        # No instruction found yet, descend one level.
        level = first_dict
        depth += 1


def generate_markdown_docs(
    nested_column_map: dict[tuple, dict[str, Any]],
    depth: int = 0,
    indent_len: int = 4,
) -> str:
    """
    Generate a markdown nested, ordered list as documentation for the column map.

    A key in the dictionary is documented when its value is a dictionary that
    contains a `"__doc__"` key. Every dictionary value is searched recursively,
    whether or not it is documented itself.

    Example:
    >>> nested_column_map = {
    ...     "patient": {
    ...         "__doc__": "some patient info",
    ...         "age": {
    ...             "__doc__": "age of the patient",
    ...             "func": int,
    ...             "columns": ["age"],
    ...         },
    ...     },
    ... }
    >>> generate_markdown_docs(nested_column_map)
    '1. **`patient:`** some patient info\\n    1. **`age:`** age of the patient\\n'
    """
    padding = " " * indent_len * depth
    pieces = []
    item_number = 0
    for name, entry in nested_column_map.items():
        if not isinstance(entry, dict):
            continue

        if "__doc__" in entry:
            item_number += 1
            pieces.append(f"{padding}{item_number}. **`{name}:`** {entry['__doc__']}\n")

        pieces.append(generate_markdown_docs(entry, depth + 1, indent_len))

    return "".join(pieces)


@log_state()
def transform_to_lyprox(
    raw: pd.DataFrame,
    column_map: dict[tuple, dict[str, Any]]
) -> pd.DataFrame:
    """
    Transform `raw` data frame into table that can be uploaded directly to [LyProX].

    To do so, it uses instructions in the `column_map` dictionary, that needs to have
    a particular structure:

    For each column in the final 'lyproxified' `pd.DataFrame`, one entry must exist in
    the `column_map` dictionary. E.g., for the column corresponding to a patient's age,
    the dictionary should contain a key-value pair of this shape:

    ```python
    column_map = {
        ("patient", "#", "age"): {
            "func": compute_age_from_raw,
            "kwargs": {"randomize": False},
            "columns": ["birthday", "date of diagnosis"]
        },
    }
    ```

    In this example, the function `compute_age_from_raw` is called with the values of
    the columns `birthday` and `date of diagnosis` as positional arguments, and the
    keyword argument `randomize` is set to `False`. The function then returns the
    patient's age, which is subsequently stored in the column `("patient", "#", "age")`.

    Note that the `column_map` dictionary must have either a `default` key or `func`
    along with `columns` and `kwargs`, depending on the function definition. If the
    function does not take any arguments, `columns` can be omitted. If it also does
    not take any keyword arguments, `kwargs` can be omitted, too.

    A `ParsingError` is raised when an instruction has neither `default` nor `func`,
    or when calling `func` fails for any row.

    [LyProX]: https://lyprox.org
    """
    column_map = delete_private_keys(column_map)

    instruction_depth = get_instruction_depth(column_map)
    if instruction_depth > 1:
        column_map = flatten(column_map, max_depth=instruction_depth)

    processed = pd.DataFrame(columns=pd.MultiIndex.from_tuples(column_map.keys()))

    for multi_idx_col, instruction in column_map.items():
        # Empty string instructions are skipped, their column stays unfilled.
        if instruction == "":
            continue

        if "default" in instruction:
            processed[multi_idx_col] = [instruction["default"]] * len(raw)
            continue

        if "func" not in instruction:
            raise ParsingError(
                f"Column {multi_idx_col} has neither a `default` value nor `func` "
                "describing how to fill this column."
            )

        cols = instruction.get("columns", [])
        kwargs = instruction.get("kwargs", {})
        func = instruction["func"]

        try:
            # One call per row of `raw`, fed with that row's values of `cols`.
            processed[multi_idx_col] = [
                func(*vals, **kwargs) for vals in raw[cols].values
            ]
        except Exception as exc:
            raise ParsingError(
                f"Exception encountered while parsing column {multi_idx_col}"
            ) from exc

    return processed


@log_state()
def leftright_to_ipsicontra(data: pd.DataFrame):
    """
    Change absolute side reporting to tumor-relative.

    Transform reporting of LNL involvement by absolute side (right & left) to a
    reporting relative to the tumor (ipsi- & contralateral). The table `data` should
    already be in the format LyProX requires, except for the side-reporting of LNL
    involvement.

    Every patient whose `("tumor", "1", "side")` entry is not `"right"` is handled
    like a left-sided tumor. The returned table lists those patients first, followed
    by the right-sided ones, and has a fresh index.
    """
    num_patients = len(data)
    tumor_side = data["tumor", "1", "side"]

    not_right_sided = data.loc[tumor_side != "right"].rename(
        columns={"left": "ipsi", "right": "contra"}, level=1
    )
    right_sided = data.loc[tumor_side == "right"].rename(
        columns={"left": "contra", "right": "ipsi"}, level=1
    )

    merged = pd.concat([not_right_sided, right_sided], ignore_index=True)
    assert num_patients == len(merged), "Number of patients changed"
    return merged


@log_state()
def exclude_patients(raw: pd.DataFrame, exclude: list[tuple[str, Any]]):
    """
    Exclude patients in the `raw` data based on a list of what to `exclude`.

    The list contains tuples `(column, check)`. Each `check` is called with the
    column of the patients that are still left and must return a boolean mask. All
    patients for which that mask is `True` are dropped. The checks are applied in
    the order they are listed.

    Example:
    >>> exclude = [("age", lambda s: s > 50)]
    >>> table = pd.DataFrame({
    ...     "age":        [43, 82, 18, 67],
    ...     "T-category": [ 3,  4,  2,  1],
    ... })
    >>> exclude_patients(table, exclude)
       age  T-category
    0   43           3
    2   18           2
    """
    remaining = raw
    for column, check in exclude:
        drop_mask = check(remaining[column])
        remaining = remaining.loc[~drop_mask]
    return remaining


def main(args: argparse.Namespace):
    """
    The main entry point for the CLI of this command. Upon requesting `lyscripts
    data lyproxify --help`, this is the help output:

    ```
    USAGE: lyscripts data lyproxify [-h] -i INPUT [-r HEADER_ROWS [HEADER_ROWS ...]]
                                    -o OUTPUT -m MAPPING
                                    [--drop-rows DROP_ROWS [DROP_ROWS ...]]
                                    [--drop-cols DROP_COLS [DROP_COLS ...]]
                                    [--add-index]

    Consumes raw data and transforms it into a CSV of the format that LyProX can
    understand.

    To do so, it needs a dictionary that defines a mapping from raw columns to the
    LyProX style data format. See the documentation of the `transform_to_lyprox`
    function for more information.

    OPTIONAL ARGUMENTS:
      -h, --help            show this help message and exit
      -i, --input INPUT     Location of raw CSV data. (default: None)
      -r, --header-rows HEADER_ROWS [HEADER_ROWS ...]
                            List with header row indices of raw file. (default: [0])
      -o, --output OUTPUT   Location to store the lyproxified CSV file. (default:
                            None)
      -m, --mapping MAPPING
                            Location of the Python file that contains column mapping
                            instructions. This must contain a dictionary with the name
                            'column_map'. (default: None)
      --drop-rows DROP_ROWS [DROP_ROWS ...]
                            Delete rows of specified indices. Counting of rows start
                            at 0 _after_ the `header-rows`. (default: [])
      --drop-cols DROP_COLS [DROP_COLS ...]
                            Delete columns of specified indices. (default: [])
      --add-index           If the data doesn't contain an index, add one by
                            enumerating the patients (default: False)
    ```

    The mapping file is executed as a Python module. Its `COLUMN_MAP` attribute is
    required, while `EXCLUDE` is optional and defaults to excluding nobody.
    """
    # NOTE(review): `args.header_rows` is only used for its length below. Confirm
    # that `load_patient_data` reads the intended header rows on its own.
    raw: pd.DataFrame = load_patient_data(args.input)
    raw = clean_header(raw, num_cols=raw.shape[1], num_header_rows=len(args.header_rows))

    cols_to_drop = raw.columns[args.drop_cols]
    trimmed = raw.drop(cols_to_drop, axis="columns")
    trimmed = trimmed.drop(index=args.drop_rows)
    trimmed = trimmed.dropna(axis="index", how="all")
    logger.info(f"Dropped rows {args.drop_rows} and columns {cols_to_drop}.")

    spec = importlib.util.spec_from_file_location("map_module", args.mapping)
    if spec is None or spec.loader is None:
        # Without this check the failure would be an opaque `AttributeError`.
        raise ImportError(f"Cannot import mapping instructions from {args.mapping}")
    mapping = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mapping)
    logger.info(f"Imported mapping instructions from {args.mapping}")

    # The help text only asks for the column map, so a missing `EXCLUDE` list
    # simply means that no patient is excluded.
    reduced = exclude_patients(trimmed, getattr(mapping, "EXCLUDE", []))

    if args.add_index:
        reduced.insert(0, ("patient", "#", "id"), list(range(len(reduced))))
        logger.info("Added index column to data.")

    processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP)

    if ("tumor", "1", "side") in processed.columns:
        processed = leftright_to_ipsicontra(processed)

    save_table_to_csv(args.output, processed)
Error while parsing the CSV file.
Inherited Members
- builtins.Exception
- Exception
- builtins.BaseException
- with_traceback
- args
def clean_header(
    table: pd.DataFrame,
    num_cols: int,
    num_header_rows: int,
) -> pd.DataFrame:
    """
    Rename the placeholder header cells in the `table`.

    Every column label of the form `"Unnamed: {col}_level_{row}"`, with `col` in
    `range(num_cols)` and `row` in `range(num_header_rows)`, is replaced by
    `"{col}_lvl_{row}"`. All other labels are left untouched.

    The `table` is modified in place and returned for convenience.
    """
    placeholder_names = {
        f"Unnamed: {col}_level_{row}": f"{col}_lvl_{row}"
        for col in range(num_cols)
        for row in range(num_header_rows)
    }
    # One rename with the complete mapping instead of one rename per (col, row)
    # pair: every rename rebuilds the whole column index.
    table.rename(columns=placeholder_names, inplace=True)
    return table
Rename the header cells in the table
.
def get_instruction_depth(nested_column_map: dict[tuple, dict[str, Any]]) -> int:
    """
    Get the depth at which the column mapping instructions are nested.

    Instructions are a dictionary that contains either a 'func' or 'default' key.
    Only the first dictionary-valued entry of every level is followed.

    Example:
    >>> nested_column_map = {"patient": {"age": {"func": int}}}
    >>> get_instruction_depth(nested_column_map)
    2
    >>> flat_column_map = flatten(nested_column_map, max_depth=2)
    >>> get_instruction_depth(flat_column_map)
    1
    >>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
    >>> get_instruction_depth(nested_column_map)
    Traceback (most recent call last):
        ...
    ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.
    """
    depth = 1
    level = nested_column_map
    while True:
        first_dict = next(
            (entry for entry in level.values() if isinstance(entry, dict)),
            None,
        )
        if first_dict is None:
            raise ValueError(
                "Leaf of column map must be a dictionary with 'func' or 'default' key."
            )
        if "func" in first_dict or "default" in first_dict:
            return depth

        # No instruction found yet, descend one level.
        level = first_dict
        depth += 1
Get the depth at which the column mapping instructions are nested.
Instructions are a dictionary that contains either a 'func' or 'default' key.
Example:
>>> nested_column_map = {"patient": {"age": {"func": int}}}
>>> get_instruction_depth(nested_column_map)
2
>>> flat_column_map = flatten(nested_column_map, max_depth=2)
>>> get_instruction_depth(flat_column_map)
1
>>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
>>> get_instruction_depth(nested_column_map)
Traceback (most recent call last):
...
ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.
def generate_markdown_docs(
    nested_column_map: dict[tuple, dict[str, Any]],
    depth: int = 0,
    indent_len: int = 4,
) -> str:
    """
    Generate a markdown nested, ordered list as documentation for the column map.

    A key in the dictionary is documented when its value is a dictionary that
    contains a `"__doc__"` key. Every dictionary value is searched recursively,
    whether or not it is documented itself.

    Example:
    >>> nested_column_map = {
    ...     "patient": {
    ...         "__doc__": "some patient info",
    ...         "age": {
    ...             "__doc__": "age of the patient",
    ...             "func": int,
    ...             "columns": ["age"],
    ...         },
    ...     },
    ... }
    >>> generate_markdown_docs(nested_column_map)
    '1. **`patient:`** some patient info\\n    1. **`age:`** age of the patient\\n'
    """
    padding = " " * indent_len * depth
    pieces = []
    item_number = 0
    for name, entry in nested_column_map.items():
        if not isinstance(entry, dict):
            continue

        if "__doc__" in entry:
            item_number += 1
            pieces.append(f"{padding}{item_number}. **`{name}:`** {entry['__doc__']}\n")

        pieces.append(generate_markdown_docs(entry, depth + 1, indent_len))

    return "".join(pieces)
Generate a markdown nested, ordered list as documentation for the column map.
A key in the dictionary is supposed to be documented when its value is a dictionary
containing a "__doc__"
key.
Example:
>>> nested_column_map = {
... "patient": {
... "__doc__": "some patient info",
... "age": {
... "__doc__": "age of the patient",
... "func": int,
... "columns": ["age"],
... },
... },
... }
>>> generate_markdown_docs(nested_column_map)
'1. **`patient:`** some patient info\n 1. **`age:`** age of the patient\n'
@log_state()
def transform_to_lyprox(
    raw: pd.DataFrame,
    column_map: dict[tuple, dict[str, Any]]
) -> pd.DataFrame:
    """
    Transform `raw` data frame into table that can be uploaded directly to [LyProX].

    To do so, it uses instructions in the `column_map` dictionary, that needs to have
    a particular structure:

    For each column in the final 'lyproxified' `pd.DataFrame`, one entry must exist in
    the `column_map` dictionary. E.g., for the column corresponding to a patient's age,
    the dictionary should contain a key-value pair of this shape:

    ```python
    column_map = {
        ("patient", "#", "age"): {
            "func": compute_age_from_raw,
            "kwargs": {"randomize": False},
            "columns": ["birthday", "date of diagnosis"]
        },
    }
    ```

    In this example, the function `compute_age_from_raw` is called with the values of
    the columns `birthday` and `date of diagnosis` as positional arguments, and the
    keyword argument `randomize` is set to `False`. The function then returns the
    patient's age, which is subsequently stored in the column `("patient", "#", "age")`.

    Note that the `column_map` dictionary must have either a `default` key or `func`
    along with `columns` and `kwargs`, depending on the function definition. If the
    function does not take any arguments, `columns` can be omitted. If it also does
    not take any keyword arguments, `kwargs` can be omitted, too.

    A `ParsingError` is raised when an instruction has neither `default` nor `func`,
    or when calling `func` fails for any row.

    [LyProX]: https://lyprox.org
    """
    column_map = delete_private_keys(column_map)

    instruction_depth = get_instruction_depth(column_map)
    if instruction_depth > 1:
        column_map = flatten(column_map, max_depth=instruction_depth)

    processed = pd.DataFrame(columns=pd.MultiIndex.from_tuples(column_map.keys()))

    for multi_idx_col, instruction in column_map.items():
        # Empty string instructions are skipped, their column stays unfilled.
        if instruction == "":
            continue

        if "default" in instruction:
            processed[multi_idx_col] = [instruction["default"]] * len(raw)
            continue

        if "func" not in instruction:
            raise ParsingError(
                f"Column {multi_idx_col} has neither a `default` value nor `func` "
                "describing how to fill this column."
            )

        cols = instruction.get("columns", [])
        kwargs = instruction.get("kwargs", {})
        func = instruction["func"]

        try:
            # One call per row of `raw`, fed with that row's values of `cols`.
            processed[multi_idx_col] = [
                func(*vals, **kwargs) for vals in raw[cols].values
            ]
        except Exception as exc:
            raise ParsingError(
                f"Exception encountered while parsing column {multi_idx_col}"
            ) from exc

    return processed
Transform raw
data frame into table that can be uploaded directly to [LyProX].
To do so, it uses instructions in the column_map
dictionary, that needs to have
a particular structure:
For each column in the final 'lyproxified' pd.DataFrame
, one entry must exist in
the column_map
dictionary. E.g., for the column corresponding to a patient's age,
the dictionary should contain a key-value pair of this shape:
column_map = {
("patient", "#", "age"): {
"func": compute_age_from_raw,
"kwargs": {"randomize": False},
"columns": ["birthday", "date of diagnosis"]
},
}
In this example, the function compute_age_from_raw
is called with the values of
the columns birthday
and date of diagnosis
as positional arguments, and the
keyword argument randomize
is set to False
. The function then returns the
patient's age, which is subsequently stored in the column ("patient", "#", "age")
.
Note that the column_map
dictionary must have either a default
key or func
along with columns
and kwargs
, depending on the function definition. If the
function does not take any arguments, columns
can be omitted. If it also does
not take any keyword arguments, kwargs
can be omitted, too.
@log_state()
def leftright_to_ipsicontra(data: pd.DataFrame):
    """
    Change absolute side reporting to tumor-relative.

    Transform reporting of LNL involvement by absolute side (right & left) to a
    reporting relative to the tumor (ipsi- & contralateral). The table `data` should
    already be in the format LyProX requires, except for the side-reporting of LNL
    involvement.

    Every patient whose `("tumor", "1", "side")` entry is not `"right"` is handled
    like a left-sided tumor. The returned table lists those patients first, followed
    by the right-sided ones, and has a fresh index.
    """
    num_patients = len(data)
    tumor_side = data["tumor", "1", "side"]

    not_right_sided = data.loc[tumor_side != "right"].rename(
        columns={"left": "ipsi", "right": "contra"}, level=1
    )
    right_sided = data.loc[tumor_side == "right"].rename(
        columns={"left": "contra", "right": "ipsi"}, level=1
    )

    merged = pd.concat([not_right_sided, right_sided], ignore_index=True)
    assert num_patients == len(merged), "Number of patients changed"
    return merged
Change absolute side reporting to tumor-relative.
Transform reporting of LNL involvement by absolute side (right & left) to a
reporting relative to the tumor (ipsi- & contralateral). The table data
should
already be in the format LyProX requires, except for the side-reporting of LNL
involvement.
@log_state()
def exclude_patients(raw: pd.DataFrame, exclude: list[tuple[str, Any]]):
    """
    Exclude patients in the `raw` data based on a list of what to `exclude`.

    The list contains tuples `(column, check)`. Each `check` is called with the
    column of the patients that are still left and must return a boolean mask. All
    patients for which that mask is `True` are dropped. The checks are applied in
    the order they are listed.

    Example:
    >>> exclude = [("age", lambda s: s > 50)]
    >>> table = pd.DataFrame({
    ...     "age":        [43, 82, 18, 67],
    ...     "T-category": [ 3,  4,  2,  1],
    ... })
    >>> exclude_patients(table, exclude)
       age  T-category
    0   43           3
    2   18           2
    """
    remaining = raw
    for column, check in exclude:
        drop_mask = check(remaining[column])
        remaining = remaining.loc[~drop_mask]
    return remaining
Exclude patients in the raw
data based on a list of what to exclude
. This
list contains tuples (column, check)
. The check
function will then exclude
any patients from the cohort where check(raw[column])
evaluates to True
.
Example:
>>> exclude = [("age", lambda s: s > 50)]
>>> table = pd.DataFrame({
... "age": [43, 82, 18, 67],
... "T-category": [ 3, 4, 2, 1],
... })
>>> exclude_patients(table, exclude)
age T-category
0 43 3
2 18 2
def main(args: argparse.Namespace):
    """
    The main entry point for the CLI of this command. Upon requesting `lyscripts
    data lyproxify --help`, this is the help output:

    ```
    USAGE: lyscripts data lyproxify [-h] -i INPUT [-r HEADER_ROWS [HEADER_ROWS ...]]
                                    -o OUTPUT -m MAPPING
                                    [--drop-rows DROP_ROWS [DROP_ROWS ...]]
                                    [--drop-cols DROP_COLS [DROP_COLS ...]]
                                    [--add-index]

    Consumes raw data and transforms it into a CSV of the format that LyProX can
    understand.

    To do so, it needs a dictionary that defines a mapping from raw columns to the
    LyProX style data format. See the documentation of the `transform_to_lyprox`
    function for more information.

    OPTIONAL ARGUMENTS:
      -h, --help            show this help message and exit
      -i, --input INPUT     Location of raw CSV data. (default: None)
      -r, --header-rows HEADER_ROWS [HEADER_ROWS ...]
                            List with header row indices of raw file. (default: [0])
      -o, --output OUTPUT   Location to store the lyproxified CSV file. (default:
                            None)
      -m, --mapping MAPPING
                            Location of the Python file that contains column mapping
                            instructions. This must contain a dictionary with the name
                            'column_map'. (default: None)
      --drop-rows DROP_ROWS [DROP_ROWS ...]
                            Delete rows of specified indices. Counting of rows start
                            at 0 _after_ the `header-rows`. (default: [])
      --drop-cols DROP_COLS [DROP_COLS ...]
                            Delete columns of specified indices. (default: [])
      --add-index           If the data doesn't contain an index, add one by
                            enumerating the patients (default: False)
    ```

    The mapping file is executed as a Python module. Its `COLUMN_MAP` attribute is
    required, while `EXCLUDE` is optional and defaults to excluding nobody.
    """
    # NOTE(review): `args.header_rows` is only used for its length below. Confirm
    # that `load_patient_data` reads the intended header rows on its own.
    raw: pd.DataFrame = load_patient_data(args.input)
    raw = clean_header(raw, num_cols=raw.shape[1], num_header_rows=len(args.header_rows))

    cols_to_drop = raw.columns[args.drop_cols]
    trimmed = raw.drop(cols_to_drop, axis="columns")
    trimmed = trimmed.drop(index=args.drop_rows)
    trimmed = trimmed.dropna(axis="index", how="all")
    logger.info(f"Dropped rows {args.drop_rows} and columns {cols_to_drop}.")

    spec = importlib.util.spec_from_file_location("map_module", args.mapping)
    if spec is None or spec.loader is None:
        # Without this check the failure would be an opaque `AttributeError`.
        raise ImportError(f"Cannot import mapping instructions from {args.mapping}")
    mapping = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mapping)
    logger.info(f"Imported mapping instructions from {args.mapping}")

    # The help text only asks for the column map, so a missing `EXCLUDE` list
    # simply means that no patient is excluded.
    reduced = exclude_patients(trimmed, getattr(mapping, "EXCLUDE", []))

    if args.add_index:
        reduced.insert(0, ("patient", "#", "id"), list(range(len(reduced))))
        logger.info("Added index column to data.")

    processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP)

    if ("tumor", "1", "side") in processed.columns:
        processed = leftright_to_ipsicontra(processed)

    save_table_to_csv(args.output, processed)
The main entry point for the CLI of this command. Upon requesting lyscripts
data lyproxify --help
, this is the help output:
USAGE: lyscripts data lyproxify [-h] -i INPUT [-r HEADER_ROWS [HEADER_ROWS ...]]
-o OUTPUT -m MAPPING
[--drop-rows DROP_ROWS [DROP_ROWS ...]]
[--drop-cols DROP_COLS [DROP_COLS ...]]
[--add-index]
Consumes raw data and transforms it into a CSV of the format that LyProX can
understand.
To do so, it needs a dictionary that defines a mapping from raw columns to the
LyProX style data format. See the documentation of the `transform_to_lyprox`
function for more information.
OPTIONAL ARGUMENTS:
-h, --help show this help message and exit
-i, --input INPUT Location of raw CSV data. (default: None)
-r, --header-rows HEADER_ROWS [HEADER_ROWS ...]
List with header row indices of raw file. (default: [0])
-o, --output OUTPUT Location to store the lyproxified CSV file. (default:
None)
-m, --mapping MAPPING
Location of the Python file that contains column mapping
instructions. This must contain a dictionary with the name
'column_map'. (default: None)
--drop-rows DROP_ROWS [DROP_ROWS ...]
Delete rows of specified indices. Counting of rows start
at 0 _after_ the `header-rows`. (default: [])
--drop-cols DROP_COLS [DROP_COLS ...]
Delete columns of specified indices. (default: [])
--add-index If the data doesn't contain an index, add one by
enumerating the patients (default: False)