Consumes raw data and transforms it into a CSV of the format that [LyProX] understands.

To do so, it needs a dictionary that defines a mapping from raw columns to the LyProX style data format. See the documentation of the transform_to_lyprox function for more information.

 10# pylint: disable=logging-fstring-interpolation
 11import argparse
 12import importlib.util
 13import logging
 14import warnings
 15from pathlib import Path
 16from typing import Any
 18import pandas as pd
 20from import save_table_to_csv
 21from lyscripts.decorators import log_state
 22from lyscripts.utils import delete_private_keys, flatten, load_patient_data
# Globally ignore every `FutureWarning` from here on.
# NOTE(review): presumably meant to silence deprecation notices from pandas - confirm.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Module-level logger named after this module.
logger = logging.getLogger(__name__)
 30def _add_parser(
 31    subparsers: argparse._SubParsersAction,
 32    help_formatter,
 34    """
 35    Add an `ArgumentParser` to the subparsers action.
 36    """
 37    parser = subparsers.add_parser(
 38        Path(__file__).name.replace(".py", ""),
 39        description=__doc__,
 40        help=__doc__,
 41        formatter_class=help_formatter,
 42    )
 43    _add_arguments(parser)
 46def _add_arguments(parser: argparse.ArgumentParser):
 47    """
 48    Add arguments needed to run this script to a `subparsers` instance
 49    and run the respective main function when chosen.
 50    """
 51    parser.add_argument(
 52        "-i", "--input", type=Path, required=True,
 53        help="Location of raw CSV data."
 54    )
 55    parser.add_argument(
 56        "-r", "--header-rows", nargs="+", default=[0], type=int,
 57        help="List with header row indices of raw file."
 58    )
 59    parser.add_argument(
 60        "-o", "--output", type=Path, required=True,
 61        help="Location to store the lyproxified CSV file."
 62    )
 63    parser.add_argument(
 64        "-m", "--mapping", type=Path, required=True,
 65        help=(
 66            "Location of the Python file that contains column mapping instructions. "
 67            "This must contain a dictionary with the name 'column_map'."
 68        )
 69    )
 70    parser.add_argument(
 71        "--drop-rows", nargs="+", type=int, default=[],
 72        help=(
 73            "Delete rows of specified indices. Counting of rows start at 0 _after_ "
 74            "the `header-rows`."
 75        )
 76    )
 77    parser.add_argument(
 78        "--drop-cols", nargs="+", type=int, default=[],
 79        help="Delete columns of specified indices.",
 80    )
 81    parser.add_argument(
 82        "--add-index", action="store_true",
 83        help="If the data doesn't contain an index, add one by enumerating the patients"
 84    )
 86    parser.set_defaults(run_main=main)
class ParsingError(Exception):
    """
    Error while parsing the CSV file.

    Raised by `transform_to_lyprox` when the function of a column instruction fails
    or when an instruction provides neither a `default` value nor a `func`.
    """
 93def clean_header(
 94    table: pd.DataFrame,
 95    num_cols: int,
 96    num_header_rows: int,
 97) -> pd.DataFrame:
 98    """Rename the header cells in the `table`."""
 99    for col in range(num_cols):
100        for row in range(num_header_rows):
101            table.rename(
102                columns={f"Unnamed: {col}_level_{row}": f"{col}_lvl_{row}"},
103                inplace=True,
104            )
105    return table
def get_instruction_depth(nested_column_map: dict[tuple, dict[str, Any]]) -> int:
    """
    Get the depth at which the column mapping instructions are nested.

    Instructions are a dictionary that contains either a 'func' or 'default' key.
    Only the first entry on every level is inspected to determine the depth.

    Raises a `ValueError` when the inspected entry is not a dictionary, or when a
    (sub-)dictionary is empty and therefore holds no instructions at all.

    Example:
    >>> nested_column_map = {"patient": {"age": {"func": int}}}
    >>> get_instruction_depth(nested_column_map)
    2
    >>> flat_column_map = flatten(nested_column_map, max_depth=2)
    >>> get_instruction_depth(flat_column_map)
    1
    >>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
    >>> get_instruction_depth(nested_column_map)
    Traceback (most recent call last):
        ...
    ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.
    """
    for value in nested_column_map.values():
        if not isinstance(value, dict):
            break

        if "func" in value or "default" in value:
            return 1

        return 1 + get_instruction_depth(value)

    # Reached for a non-dictionary leaf and for an empty dictionary. The latter would
    # otherwise fall out of the loop and implicitly return `None` instead of an `int`.
    raise ValueError(
        "Leaf of column map must be a dictionary with 'func' or 'default' key."
    )
def generate_markdown_docs(
    nested_column_map: dict[tuple, dict[str, Any]],
    depth: int = 0,
    indent_len: int = 4,
) -> str:
    """
    Generate a markdown nested, ordered list as documentation for the column map.

    A key in the dictionary is listed when its value is a dictionary containing a
    `"__doc__"` key. Dictionary values are always searched recursively for further
    documented keys, which are rendered one level deeper. Values that are not
    dictionaries are ignored.

    Example:
    >>> nested_column_map = {
    ...     "patient": {
    ...         "__doc__": "some patient info",
    ...         "age": {
    ...             "__doc__": "age of the patient",
    ...             "func": int,
    ...             "columns": ["age"],
    ...         },
    ...     },
    ... }
    >>> generate_markdown_docs(nested_column_map)
    '1. **`patient:`** some patient info\\n    1. **`age:`** age of the patient\\n'
    """
    prefix = " " * indent_len * depth
    pieces = []
    num_documented = 0

    for name, entry in nested_column_map.items():
        if not isinstance(entry, dict):
            continue

        if "__doc__" in entry:
            num_documented += 1
            pieces.append(f"{prefix}{num_documented}. **`{name}:`** {entry['__doc__']}\n")

        pieces.append(generate_markdown_docs(entry, depth + 1, indent_len))

    return "".join(pieces)
def transform_to_lyprox(
    raw: pd.DataFrame,
    column_map: dict[tuple, dict[str, Any]]
) -> pd.DataFrame:
    """
    Transform the `raw` data frame into a table in the format that LyProX expects.

    How each output column is filled is defined by the `column_map` dictionary, which
    needs to have a particular structure:

    For each column in the final 'lyproxified' `pd.DataFrame`, one entry must exist in
    the `column_map` dictionary. E.g., for the column corresponding to a patient's age,
    the dictionary should contain a key-value pair of this shape:

    ```python
    column_map = {
        ("patient", "#", "age"): {
            "func": compute_age_from_raw,
            "kwargs": {"randomize": False},
            "columns": ["birthday", "date of diagnosis"]
        },
    }
    ```

    In this example, the function `compute_age_from_raw` is called once per row with
    the values of the columns `birthday` and `date of diagnosis` as positional
    arguments, and the keyword argument `randomize` set to `False`. The value it
    returns is stored in the column `("patient", "#", "age")`.

    Every instruction must have either a `default` key, whose value is used for all
    rows, or a `func` key along with optional `columns` and `kwargs`. If the function
    takes no positional arguments, `columns` can be omitted, and if it takes no
    keyword arguments, `kwargs` can be omitted, too. A column whose instruction is an
    empty string is created but left unfilled.

    A `ParsingError` is raised when an instruction has neither `default` nor `func`,
    or when any exception occurs while computing a column.
    """
    column_map = delete_private_keys(column_map)

    # Nested maps are flattened so that every key is one complete column tuple.
    instruction_depth = get_instruction_depth(column_map)
    if instruction_depth > 1:
        column_map = flatten(column_map, max_depth=instruction_depth)

    processed = pd.DataFrame(columns=pd.MultiIndex.from_tuples(column_map.keys()))

    for target_col, instruction in column_map.items():
        if instruction == "":
            continue

        if "default" in instruction:
            processed[target_col] = [instruction["default"]] * len(raw)
            continue

        if "func" not in instruction:
            raise ParsingError(
                f"Column {target_col} has neither a `default` value nor `func` "
                "describing how to fill this column."
            )

        source_cols = instruction.get("columns", [])
        extra_kwargs = instruction.get("kwargs", {})
        compute = instruction["func"]

        try:
            processed[target_col] = [
                compute(*row_values, **extra_kwargs)
                for row_values in raw[source_cols].values
            ]
        except Exception as exc:
            raise ParsingError(
                f"Exception encountered while parsing column {target_col}"
            ) from exc

    return processed
def leftright_to_ipsicontra(data: pd.DataFrame):
    """
    Change absolute side reporting to tumor-relative.

    Columns whose second header level is `left` or `right` are relabelled `ipsi` or
    `contra`, depending on the value in `("tumor", "1", "side")`: for patients whose
    tumor side is `"right"`, right becomes ipsi and left becomes contra. For all other
    patients, left becomes ipsi and right becomes contra. The table `data` should
    already be in the format LyProX requires, except for the side-reporting of LNL
    involvement.

    The returned table holds the patients that are not right-sided first, followed by
    the right-sided ones, with a freshly enumerated index.
    """
    num_patients = len(data)
    tumor_side = data["tumor", "1", "side"]

    not_right_sided = data.loc[tumor_side != "right"].rename(
        columns={"left": "ipsi", "right": "contra"}, level=1
    )
    right_sided = data.loc[tumor_side == "right"].rename(
        columns={"left": "contra", "right": "ipsi"}, level=1
    )

    data = pd.concat([not_right_sided, right_sided], ignore_index=True)
    assert num_patients == len(data), "Number of patients changed"
    return data
def exclude_patients(raw: pd.DataFrame, exclude: list[tuple[str, Any]]):
    """
    Drop patients from the `raw` data according to the `exclude` list.

    The list holds tuples `(column, check)`. One after another, each `check` function
    is called with `raw[column]` of the patients that are still left, and every
    patient for whom it evaluates to `True` is removed from the cohort.

    Example:
    >>> exclude = [("age", lambda s: s > 50)]
    >>> table = pd.DataFrame({
    ...     "age":        [43, 82, 18, 67],
    ...     "T-category": [ 3,  4,  2,  1],
    ... })
    >>> exclude_patients(table, exclude)
       age  T-category
    0   43           3
    2   18           2
    """
    remaining = raw
    for column, check in exclude:
        is_excluded = check(remaining[column])
        remaining = remaining.loc[~is_excluded]
    return remaining
def main(args: argparse.Namespace):
    """
    The main entry point for the CLI of this command. Upon requesting `lyscripts
    data lyproxify --help`, this is the help output:

    ```
    USAGE: lyscripts data lyproxify [-h] -i INPUT [-r HEADER_ROWS [HEADER_ROWS ...]]
                                    -o OUTPUT -m MAPPING
                                    [--drop-rows DROP_ROWS [DROP_ROWS ...]]
                                    [--drop-cols DROP_COLS [DROP_COLS ...]]
                                    [--add-index]

    Consumes raw data and transforms it into a CSV of the format that LyProX can
    understand.

    To do so, it needs a dictionary that defines a mapping from raw columns to the
    LyProX style data format. See the documentation of the `transform_to_lyprox`
    function for more information.

      -h, --help            show this help message and exit
      -i, --input INPUT     Location of raw CSV data. (default: None)
      -r, --header-rows HEADER_ROWS [HEADER_ROWS ...]
                            List with header row indices of raw file. (default: [0])
      -o, --output OUTPUT   Location to store the lyproxified CSV file. (default:
                            None)
      -m, --mapping MAPPING
                            Location of the Python file that contains column mapping
                            instructions. This must contain a dictionary with the name
                            'column_map'. (default: None)
      --drop-rows DROP_ROWS [DROP_ROWS ...]
                            Delete rows of specified indices. Counting of rows start
                            at 0 _after_ the `header-rows`. (default: [])
      --drop-cols DROP_COLS [DROP_COLS ...]
                            Delete columns of specified indices. (default: [])
      --add-index           If the data doesn't contain an index, add one by
                            enumerating the patients (default: False)
    ```

    The steps are: load the raw table, clean its header, drop the requested columns
    and rows as well as rows that are entirely empty, import the mapping file, exclude
    patients, optionally add an enumerating `("patient", "#", "id")` column, transform
    the table and, if it has a `("tumor", "1", "side")` column, convert left/right to
    ipsi/contra before the result is saved to `args.output`.

    The mapping file must define the column map as `COLUMN_MAP` (the lowercase name
    `column_map` from the help text is accepted as well). An `EXCLUDE` list is
    optional, and without it no patients are excluded. An `ImportError` is raised
    when the mapping file cannot be imported, and an `AttributeError` when it defines
    no column map.
    """
    # NOTE(review): `args.header_rows` is only used for its length below and is not
    # passed to `load_patient_data` - confirm that the loader picks the right header
    # rows on its own.
    raw: pd.DataFrame = load_patient_data(args.input)
    raw = clean_header(raw, num_cols=raw.shape[1], num_header_rows=len(args.header_rows))

    cols_to_drop = raw.columns[args.drop_cols]
    trimmed = raw.drop(cols_to_drop, axis="columns")
    trimmed = trimmed.drop(index=args.drop_rows)
    trimmed = trimmed.dropna(axis="index", how="all")
    logger.info(f"Dropped rows {args.drop_rows} and columns {cols_to_drop}.")

    spec = importlib.util.spec_from_file_location("map_module", args.mapping)
    if spec is None or spec.loader is None:
        # No loader could be determined for the given path, so fail with a clear
        # message instead of an attribute error on `None`.
        raise ImportError(f"Could not import mapping instructions from {args.mapping}")
    mapping = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mapping)
    logger.info(f"Imported mapping instructions from {args.mapping}")

    # The help text of `--mapping` advertises the lowercase name, so accept both.
    column_map = getattr(mapping, "COLUMN_MAP", None)
    if column_map is None:
        column_map = getattr(mapping, "column_map", None)
    if column_map is None:
        raise AttributeError(
            f"Mapping file {args.mapping} must define a dictionary named 'COLUMN_MAP'."
        )

    # A mapping file without exclusion rules simply keeps all patients.
    reduced = exclude_patients(trimmed, getattr(mapping, "EXCLUDE", []))

    if args.add_index:
        reduced.insert(0, ("patient", "#", "id"), list(range(len(reduced))))
        logger.info("Added index column to data.")

    processed = transform_to_lyprox(reduced, column_map)

    if ("tumor", "1", "side") in processed.columns:
        processed = leftright_to_ipsicontra(processed)

    save_table_to_csv(args.output, processed)
