lyscripts.data.lyproxify

Consumes raw data and transforms it into a CSV of the format that [LyProX] understands.

To do so, it needs a dictionary that defines a mapping from raw columns to the LyProX style data format. See the documentation of the transform_to_lyprox function for more information.

  1"""
  2Consumes raw data and transforms it into a CSV of the format that [LyProX] understands.
  3
  4To do so, it needs a dictionary that defines a mapping from raw columns to the LyProX
  5style data format. See the documentation of the `transform_to_lyprox` function for
  6more information.
  7
  8[LyProX]: https://lyprox.org
  9"""
 10# pylint: disable=logging-fstring-interpolation
 11import argparse
 12import importlib.util
 13import logging
 14import warnings
 15from pathlib import Path
 16from typing import Any
 17
 18import pandas as pd
 19
 20from lyscripts.data.utils import save_table_to_csv
 21from lyscripts.decorators import log_state
 22from lyscripts.utils import delete_private_keys, flatten, load_patient_data
 23
 24warnings.simplefilter(action="ignore", category=FutureWarning)
 25
 26
 27logger = logging.getLogger(__name__)
 28
 29
 30def _add_parser(
 31    subparsers: argparse._SubParsersAction,
 32    help_formatter,
 33):
 34    """
 35    Add an `ArgumentParser` to the subparsers action.
 36    """
 37    parser = subparsers.add_parser(
 38        Path(__file__).name.replace(".py", ""),
 39        description=__doc__,
 40        help=__doc__,
 41        formatter_class=help_formatter,
 42    )
 43    _add_arguments(parser)
 44
 45
 46def _add_arguments(parser: argparse.ArgumentParser):
 47    """
 48    Add arguments needed to run this script to a `subparsers` instance
 49    and run the respective main function when chosen.
 50    """
 51    parser.add_argument(
 52        "-i", "--input", type=Path, required=True,
 53        help="Location of raw CSV data."
 54    )
 55    parser.add_argument(
 56        "-r", "--header-rows", nargs="+", default=[0], type=int,
 57        help="List with header row indices of raw file."
 58    )
 59    parser.add_argument(
 60        "-o", "--output", type=Path, required=True,
 61        help="Location to store the lyproxified CSV file."
 62    )
 63    parser.add_argument(
 64        "-m", "--mapping", type=Path, required=True,
 65        help=(
 66            "Location of the Python file that contains column mapping instructions. "
 67            "This must contain a dictionary with the name 'column_map'."
 68        )
 69    )
 70    parser.add_argument(
 71        "--drop-rows", nargs="+", type=int, default=[],
 72        help=(
 73            "Delete rows of specified indices. Counting of rows start at 0 _after_ "
 74            "the `header-rows`."
 75        )
 76    )
 77    parser.add_argument(
 78        "--drop-cols", nargs="+", type=int, default=[],
 79        help="Delete columns of specified indices.",
 80    )
 81    parser.add_argument(
 82        "--add-index", action="store_true",
 83        help="If the data doesn't contain an index, add one by enumerating the patients"
 84    )
 85
 86    parser.set_defaults(run_main=main)
 87
 88
 89class ParsingError(Exception):
 90    """Error while parsing the CSV file."""
 91
 92
 93def clean_header(
 94    table: pd.DataFrame,
 95    num_cols: int,
 96    num_header_rows: int,
 97) -> pd.DataFrame:
 98    """Rename the header cells in the `table`."""
 99    for col in range(num_cols):
100        for row in range(num_header_rows):
101            table.rename(
102                columns={f"Unnamed: {col}_level_{row}": f"{col}_lvl_{row}"},
103                inplace=True,
104            )
105    return table
106
107
def get_instruction_depth(nested_column_map: dict[tuple, dict[str, Any]]) -> int:
    """
    Get the depth at which the column mapping instructions are nested.

    Instructions are a dictionary that contains either a 'func' or 'default' key.

    Only the first entry of each nesting level is inspected; the remaining entries
    are assumed to be nested equally deep.

    Raises a `ValueError` if the inspected entry is not a dictionary, or if the
    column map is empty at any nesting level.

    Example:
    >>> nested_column_map = {"patient": {"age": {"func": int}}}
    >>> get_instruction_depth(nested_column_map)
    2
    >>> flat_column_map = flatten(nested_column_map, max_depth=2)
    >>> get_instruction_depth(flat_column_map)
    1
    >>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
    >>> get_instruction_depth(nested_column_map)
    Traceback (most recent call last):
        ...
    ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.
    """
    if not nested_column_map:
        # Without this check the function would silently return `None`, which
        # only surfaces later as an unrelated `TypeError`.
        raise ValueError("Column map must not be empty at any nesting level.")

    first_value = next(iter(nested_column_map.values()))

    if not isinstance(first_value, dict):
        raise ValueError(
            "Leaf of column map must be a dictionary with 'func' or 'default' key."
        )

    if "func" in first_value or "default" in first_value:
        return 1

    return 1 + get_instruction_depth(first_value)
137
138
def generate_markdown_docs(
    nested_column_map: dict[tuple, dict[str, Any]],
    depth: int = 0,
    indent_len: int = 4,
) -> str:
    """
    Build a nested, ordered markdown list that documents the column map.

    A key gets its own list item when its value is a dictionary containing a
    `"__doc__"` key. Every dictionary value is descended into, one level deeper,
    whether or not it is documented itself. Values that are not dictionaries are
    skipped.

    Example:
    >>> nested_column_map = {
    ...     "patient": {
    ...         "__doc__": "some patient info",
    ...         "age": {
    ...             "__doc__": "age of the patient",
    ...             "func": int,
    ...             "columns": ["age"],
    ...         },
    ...     },
    ... }
    >>> generate_markdown_docs(nested_column_map)
    '1. **`patient:`** some patient info\\n    1. **`age:`** age of the patient\\n'
    """
    prefix = " " * indent_len * depth
    pieces = []
    item_number = 1

    for key, value in nested_column_map.items():
        if not isinstance(value, dict):
            continue

        if "__doc__" in value:
            pieces.append(f"{prefix}{item_number}. **`{key}:`** {value['__doc__']}\n")
            item_number += 1

        pieces.append(generate_markdown_docs(value, depth + 1, indent_len))

    return "".join(pieces)
176
177
@log_state()
def transform_to_lyprox(
    raw: pd.DataFrame,
    column_map: dict[tuple, dict[str, Any]]
) -> pd.DataFrame:
    """
    Transform `raw` data frame into table that can be uploaded directly to [LyProX].

    The transformation follows the instructions in the `column_map` dictionary,
    which needs to have a particular structure:

    For each column in the final 'lyproxified' `pd.DataFrame`, one entry must exist in
    the `column_map` dictionary. E.g., for the column corresponding to a patient's
    age, the dictionary should contain a key-value pair of this shape:

    ```python
    column_map = {
        ("patient", "#", "age"): {
            "func": compute_age_from_raw,
            "kwargs": {"randomize": False},
            "columns": ["birthday", "date of diagnosis"]
        },
    }
    ```

    In this example, the function `compute_age_from_raw` is called once per row with
    the values of the columns `birthday` and `date of diagnosis` as positional
    arguments, and the keyword argument `randomize` set to `False`. The returned
    value is stored in the column `("patient", "#", "age")`.

    Each instruction must have either a `default` key, whose value fills the whole
    column, or a `func` key, optionally accompanied by `columns` and `kwargs`. When
    both are present, `default` takes precedence. An instruction that equals the
    empty string is skipped. An instruction with neither key raises a
    `ParsingError`, as does any exception raised while computing a column.

    [LyProX]: https://lyprox.org
    """
    column_map = delete_private_keys(column_map)

    instruction_depth = get_instruction_depth(column_map)
    if instruction_depth > 1:
        column_map = flatten(column_map, max_depth=instruction_depth)

    processed = pd.DataFrame(columns=pd.MultiIndex.from_tuples(column_map.keys()))
    num_patients = len(raw)

    for multi_idx_col, instruction in column_map.items():
        if instruction == "":
            continue

        if "default" in instruction:
            processed[multi_idx_col] = [instruction["default"]] * num_patients
            continue

        if "func" not in instruction:
            raise ParsingError(
                f"Column {multi_idx_col} has neither a `default` value nor `func` "
                "describing how to fill this column."
            )

        func = instruction["func"]
        cols = instruction.get("columns", [])
        kwargs = instruction.get("kwargs", {})

        try:
            processed[multi_idx_col] = [
                func(*vals, **kwargs) for vals in raw[cols].values
            ]
        except Exception as exc:
            raise ParsingError(
                f"Exception encountered while parsing column {multi_idx_col}"
            ) from exc

    return processed
246
247
@log_state()
def leftright_to_ipsicontra(data: pd.DataFrame):
    """
    Convert absolute side reporting into tumor-relative reporting.

    Second-level column labels `left` and `right` are renamed to `ipsi` and
    `contra`, relative to the tumor side stored in the column
    `("tumor", "1", "side")`. Rows whose tumor side equals `"right"` get
    right -> ipsi and left -> contra; all other rows get left -> ipsi and
    right -> contra. The table `data` should already be in the format LyProX
    requires, except for the side-reporting of LNL involvement.

    The returned table has a fresh index and lists the rows that are not
    right-sided first, followed by the right-sided ones.
    """
    len_before = len(data)
    tumor_side = data["tumor", "1", "side"]

    not_right_sided = data.loc[tumor_side != "right"]
    right_sided = data.loc[tumor_side == "right"]

    # Old and new labels never coincide, so each pair of renames can be applied
    # in one call.
    not_right_sided = not_right_sided.rename(
        columns={"left": "ipsi", "right": "contra"}, level=1
    )
    right_sided = right_sided.rename(
        columns={"left": "contra", "right": "ipsi"}, level=1
    )

    data = pd.concat([not_right_sided, right_sided], ignore_index=True)
    assert len_before == len(data), "Number of patients changed"
    return data
276
277
@log_state()
def exclude_patients(raw: pd.DataFrame, exclude: list[tuple[str, Any]]):
    """
    Remove patients from the `raw` data according to the `exclude` list.

    The list holds tuples `(column, check)`. The checks are applied one after the
    other, each on the rows that are still left: every patient for which
    `check(raw[column])` evaluates to `True` is dropped from the cohort.

    Example:
    >>> exclude = [("age", lambda s: s > 50)]
    >>> table = pd.DataFrame({
    ...     "age":        [43, 82, 18, 67],
    ...     "T-category": [ 3,  4,  2,  1],
    ... })
    >>> exclude_patients(table, exclude)
       age  T-category
    0   43           3
    2   18           2
    """
    remaining = raw
    for column, check in exclude:
        is_excluded = check(remaining[column])
        remaining = remaining.loc[~is_excluded]
    return remaining
300
301
def main(args: argparse.Namespace):
    """
    The main entry point for the CLI of this command. Upon requesting `lyscripts
    data lyproxify --help`, this is the help output:

    ```
    USAGE: lyscripts data lyproxify [-h] -i INPUT [-r HEADER_ROWS [HEADER_ROWS ...]]
                                    -o OUTPUT -m MAPPING
                                    [--drop-rows DROP_ROWS [DROP_ROWS ...]]
                                    [--drop-cols DROP_COLS [DROP_COLS ...]]
                                    [--add-index]

    Consumes raw data and transforms it into a CSV of the format that LyProX can
    understand.

    To do so, it needs a dictionary that defines a mapping from raw columns to the
    LyProX style data format. See the documentation of the `transform_to_lyprox`
    function for more information.

    OPTIONAL ARGUMENTS:
      -h, --help            show this help message and exit
      -i, --input INPUT     Location of raw CSV data. (default: None)
      -r, --header-rows HEADER_ROWS [HEADER_ROWS ...]
                            List with header row indices of raw file. (default: [0])
      -o, --output OUTPUT   Location to store the lyproxified CSV file. (default:
                            None)
      -m, --mapping MAPPING
                            Location of the Python file that contains column mapping
                            instructions. This must contain a dictionary with the name
                            'column_map'. (default: None)
      --drop-rows DROP_ROWS [DROP_ROWS ...]
                            Delete rows of specified indices. Counting of rows start
                            at 0 _after_ the `header-rows`. (default: [])
      --drop-cols DROP_COLS [DROP_COLS ...]
                            Delete columns of specified indices. (default: [])
      --add-index           If the data doesn't contain an index, add one by
                            enumerating the patients (default: False)
    ```

    Note that the attributes actually read from the mapping file are `COLUMN_MAP`
    (required, passed to `transform_to_lyprox`) and `EXCLUDE` (optional, passed to
    `exclude_patients`; no patients are excluded when it is missing).

    Raises an `ImportError` if the mapping file cannot be loaded as a Python module.
    """
    # NOTE(review): `args.header_rows` is only used for its length here; confirm
    # that `load_patient_data` reads the header rows of the raw file as intended.
    raw: pd.DataFrame = load_patient_data(args.input)
    raw = clean_header(
        raw,
        num_cols=raw.shape[1],
        num_header_rows=len(args.header_rows),
    )

    cols_to_drop = raw.columns[args.drop_cols]
    trimmed = raw.drop(cols_to_drop, axis="columns")
    trimmed = trimmed.drop(index=args.drop_rows)
    trimmed = trimmed.dropna(axis="index", how="all")
    logger.info(f"Dropped rows {args.drop_rows} and columns {cols_to_drop}.")

    spec = importlib.util.spec_from_file_location("map_module", args.mapping)
    # `spec_from_file_location` returns `None` when it finds no loader for the
    # file (e.g. unrecognised suffix). Fail with a clear message instead of an
    # `AttributeError` further down.
    if spec is None or spec.loader is None:
        raise ImportError(
            f"Could not load mapping instructions from {args.mapping}; "
            "it must be an importable Python file."
        )
    mapping = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(mapping)
    logger.info(f"Imported mapping instructions from {args.mapping}")

    # A mapping file without exclusion rules is valid: nobody gets excluded.
    reduced = exclude_patients(trimmed, getattr(mapping, "EXCLUDE", []))

    if args.add_index:
        reduced.insert(0, ("patient", "#", "id"), list(range(len(reduced))))
        logger.info("Added index column to data.")

    processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP)

    # Side reporting is only converted when the tumor side column was produced.
    if ("tumor", "1", "side") in processed.columns:
        processed = leftright_to_ipsicontra(processed)

    save_table_to_csv(args.output, processed)
logger = <Logger lyscripts.data.lyproxify (WARNING)>
class ParsingError(builtins.Exception):
90class ParsingError(Exception):
91    """Error while parsing the CSV file."""

Error while parsing the CSV file.

Inherited Members
builtins.Exception
Exception
builtins.BaseException
with_traceback
args
def clean_header( table: pandas.core.frame.DataFrame, num_cols: int, num_header_rows: int) -> pandas.core.frame.DataFrame:
 94def clean_header(
 95    table: pd.DataFrame,
 96    num_cols: int,
 97    num_header_rows: int,
 98) -> pd.DataFrame:
 99    """Rename the header cells in the `table`."""
100    for col in range(num_cols):
101        for row in range(num_header_rows):
102            table.rename(
103                columns={f"Unnamed: {col}_level_{row}": f"{col}_lvl_{row}"},
104                inplace=True,
105            )
106    return table

Rename the header cells in the table.

def get_instruction_depth(nested_column_map: dict[tuple, dict[str, typing.Any]]) -> int:
109def get_instruction_depth(nested_column_map: dict[tuple, dict[str, Any]]) -> int:
110    """
111    Get the depth at which the column mapping instructions are nested.
112
113    Instructions are a dictionary that contains either a 'func' or 'default' key.
114
115    Example:
116    >>> nested_column_map = {"patient": {"age": {"func": int}}}
117    >>> get_instruction_depth(nested_column_map)
118    2
119    >>> flat_column_map = flatten(nested_column_map, max_depth=2)
120    >>> get_instruction_depth(flat_column_map)
121    1
122    >>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
123    >>> get_instruction_depth(nested_column_map)
124    Traceback (most recent call last):
125        ...
126    ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.
127    """
128    for _, value in nested_column_map.items():
129        if isinstance(value, dict):
130            if "func" in value or "default" in value:
131                return 1
132
133            return 1 + get_instruction_depth(value)
134
135        raise ValueError(
136            "Leaf of column map must be a dictionary with 'func' or 'default' key."
137        )

Get the depth at which the column mapping instructions are nested.

Instructions are a dictionary that contains either a 'func' or 'default' key.

Example:

>>> nested_column_map = {"patient": {"age": {"func": int}}}
>>> get_instruction_depth(nested_column_map)
2
>>> flat_column_map = flatten(nested_column_map, max_depth=2)
>>> get_instruction_depth(flat_column_map)
1
>>> nested_column_map = {"patient": {"__doc__": "some patient info", "age": 61}}
>>> get_instruction_depth(nested_column_map)
Traceback (most recent call last):
    ...
ValueError: Leaf of column map must be a dictionary with 'func' or 'default' key.
def generate_markdown_docs( nested_column_map: dict[tuple, dict[str, typing.Any]], depth: int = 0, indent_len: int = 4) -> str:
140def generate_markdown_docs(
141    nested_column_map: dict[tuple, dict[str, Any]],
142    depth: int = 0,
143    indent_len: int = 4,
144) -> str:
145    """
146    Generate a markdown nested, ordered list as documentation for the column map.
147
148    A key in the doctionary is supposed to be documented, when its value is a dictionary
149    containing a `"__doc__"` key.
150
151    Example:
152    >>> nested_column_map = {
153    ...     "patient": {
154    ...         "__doc__": "some patient info",
155    ...         "age": {
156    ...             "__doc__": "age of the patient",
157    ...             "func": int,
158    ...             "columns": ["age"],
159    ...         },
160    ...     },
161    ... }
162    >>> generate_markdown_docs(nested_column_map)
163    '1. **`patient:`** some patient info\\n    1. **`age:`** age of the patient\\n'
164    """
165    md_docs = ""
166    indent = " " * indent_len * depth
167    i = 1
168    for key, value in nested_column_map.items():
169        if isinstance(value, dict):
170            if "__doc__" in value:
171                md_docs += f"{indent}{i}. **`{key}:`** {value['__doc__']}\n"
172                i += 1
173
174            md_docs += generate_markdown_docs(value, depth + 1, indent_len)
175
176    return md_docs

Generate a markdown nested, ordered list as documentation for the column map.

A key in the dictionary is considered documented when its value is a dictionary containing a "__doc__" key.

Example:

>>> nested_column_map = {
...     "patient": {
...         "__doc__": "some patient info",
...         "age": {
...             "__doc__": "age of the patient",
...             "func": int,
...             "columns": ["age"],
...         },
...     },
... }
>>> generate_markdown_docs(nested_column_map)
'1. **`patient:`** some patient info\n    1. **`age:`** age of the patient\n'
@log_state()
def transform_to_lyprox( raw: pandas.core.frame.DataFrame, column_map: dict[tuple, dict[str, typing.Any]]) -> pandas.core.frame.DataFrame:
179@log_state()
180def transform_to_lyprox(
181    raw: pd.DataFrame,
182    column_map: dict[tuple, dict[str, Any]]
183) -> pd.DataFrame:
184    """
185    Transform `raw` data frame into table that can be uploaded directly to [LyProX].
186
187    To do so, it uses instructions in the `colum_map` dictionary, that needs to have
188    a particular structure:
189
190    For each column in the final 'lyproxified' `pd.DataFrame`, one entry must exist in
191    the `column_map` dctionary. E.g., for the column corresponding to a patient's age,
192    the dictionary should contain a key-value pair of this shape:
193
194    ```python
195    column_map = {
196        ("patient", "#", "age"): {
197            "func": compute_age_from_raw,
198            "kwargs": {"randomize": False},
199            "columns": ["birthday", "date of diagnosis"]
200        },
201    }
202    ```
203
204    In this example, the function `compute_age_from_raw` is called with the values of
205    the columns `birthday` and `date of diagnosis` as positional arguments, and the
206    keyword argument `randomize` is set to `False`. The function then returns the
207    patient's age, which is subsequently stored in the column `("patient", "#", "age")`.
208
209    Note that the `column_map` dictionary must have either a `default` key or `func`
210    along with `columns` and `kwargs`, depending on the function definition. If the
211    function does not take any arguments, `columns` can be omitted. If it also does
212    not take any keyword arguments, `kwargs` can be omitted, too.
213
214    [LyProX]: https://lyprox.org
215    """
216    column_map = delete_private_keys(column_map)
217
218    if (instruction_depth := get_instruction_depth(column_map)) > 1:
219        column_map = flatten(column_map, max_depth=instruction_depth)
220
221    multi_idx = pd.MultiIndex.from_tuples(column_map.keys())
222    processed = pd.DataFrame(columns=multi_idx)
223
224    for multi_idx_col, instruction in column_map.items():
225        if instruction != "":
226            if "default" in instruction:
227                processed[multi_idx_col] = [instruction["default"]] * len(raw)
228            elif "func" in instruction:
229                cols = instruction.get("columns", [])
230                kwargs = instruction.get("kwargs", {})
231                func = instruction["func"]
232
233                try:
234                    processed[multi_idx_col] = [
235                        func(*vals, **kwargs) for vals in raw[cols].values
236                    ]
237                except Exception as exc:
238                    raise ParsingError(
239                        f"Exception encountered while parsing column {multi_idx_col}"
240                    ) from exc
241            else:
242                raise ParsingError(
243                    f"Column {multi_idx_col} has neither a `default` value nor `func` "
244                    "describing how to fill this column."
245                )
246    return processed

Transform raw data frame into table that can be uploaded directly to [LyProX].

To do so, it uses instructions in the column_map dictionary, which needs to have a particular structure:

For each column in the final 'lyproxified' pd.DataFrame, one entry must exist in the column_map dictionary. E.g., for the column corresponding to a patient's age, the dictionary should contain a key-value pair of this shape:

column_map = {
    ("patient", "#", "age"): {
        "func": compute_age_from_raw,
        "kwargs": {"randomize": False},
        "columns": ["birthday", "date of diagnosis"]
    },
}

In this example, the function compute_age_from_raw is called with the values of the columns birthday and date of diagnosis as positional arguments, and the keyword argument randomize is set to False. The function then returns the patient's age, which is subsequently stored in the column ("patient", "#", "age").

Note that the column_map dictionary must have either a default key or func along with columns and kwargs, depending on the function definition. If the function does not take any arguments, columns can be omitted. If it also does not take any keyword arguments, kwargs can be omitted, too.

@log_state()
def leftright_to_ipsicontra(data: pandas.core.frame.DataFrame):
249@log_state()
250def leftright_to_ipsicontra(data: pd.DataFrame):
251    """
252    Change absolute side reporting to tumor-relative.
253
254    Transform reporting of LNL involvement by absolute side (right & left) to a
255    reporting relative to the tumor (ipsi- & contralateral). The table `data` should
256    already be in the format LyProX requires, except for the side-reporting of LNL
257    involvement.
258    """
259    len_before = len(data)
260    left_data = data.loc[
261                data["tumor", "1", "side"] != "right"
262            ]
263    right_data = data.loc[
264                data["tumor", "1", "side"] == "right"
265            ]
266
267    left_data = left_data.rename(columns={"left": "ipsi"}, level=1)
268    left_data = left_data.rename(columns={"right": "contra"}, level=1)
269    right_data = right_data.rename(columns={"left": "contra"}, level=1)
270    right_data = right_data.rename(columns={"right": "ipsi"}, level=1)
271
272    data = pd.concat(
273                [left_data, right_data], ignore_index=True
274            )
275    assert len_before == len(data), "Number of patients changed"
276    return data

Change absolute side reporting to tumor-relative.

Transform reporting of LNL involvement by absolute side (right & left) to a reporting relative to the tumor (ipsi- & contralateral). The table data should already be in the format LyProX requires, except for the side-reporting of LNL involvement.

@log_state()
def exclude_patients( raw: pandas.core.frame.DataFrame, exclude: list[tuple[str, typing.Any]]):
279@log_state()
280def exclude_patients(raw: pd.DataFrame, exclude: list[tuple[str, Any]]):
281    """
282    Exclude patients in the `raw` data based on a list of what to `exclude`. This
283    list contains tuples `(column, check)`. The `check` function will then exclude
284    any patients from the cohort where `check(raw[column])` evaluates to `True`.
285
286    Example:
287    >>> exclude = [("age", lambda s: s > 50)]
288    >>> table = pd.DataFrame({
289    ...     "age":        [43, 82, 18, 67],
290    ...     "T-category": [ 3,  4,  2,  1],
291    ... })
292    >>> exclude_patients(table, exclude)
293       age  T-category
294    0   43           3
295    2   18           2
296    """
297    for column, check in exclude:
298        exclude = check(raw[column])
299        raw = raw.loc[~exclude]
300    return raw

Exclude patients in the raw data based on a list of what to exclude. This list contains tuples (column, check). The check function will then exclude any patients from the cohort where check(raw[column]) evaluates to True.

Example:

>>> exclude = [("age", lambda s: s > 50)]
>>> table = pd.DataFrame({
...     "age":        [43, 82, 18, 67],
...     "T-category": [ 3,  4,  2,  1],
... })
>>> exclude_patients(table, exclude)
   age  T-category
0   43           3
2   18           2
def main(args: argparse.Namespace):
303def main(args: argparse.Namespace):
304    """
305    The main entry point for the CLI of this command. Upon requesting `lyscripts
306    data lyproxify --help`, this is the help output:
307
308    ```
309    USAGE: lyscripts data lyproxify [-h] -i INPUT [-r HEADER_ROWS [HEADER_ROWS ...]]
310                                    -o OUTPUT -m MAPPING
311                                    [--drop-rows DROP_ROWS [DROP_ROWS ...]]
312                                    [--drop-cols DROP_COLS [DROP_COLS ...]]
313                                    [--add-index]
314
315    Consumes raw data and transforms it into a CSV of the format that LyProX can
316    understand.
317
318    To do so, it needs a dictionary that defines a mapping from raw columns to the
319    LyProX style data format. See the documentation of the `transform_to_lyprox`
320    function for more information.
321
322    OPTIONAL ARGUMENTS:
323      -h, --help            show this help message and exit
324      -i, --input INPUT     Location of raw CSV data. (default: None)
325      -r, --header-rows HEADER_ROWS [HEADER_ROWS ...]
326                            List with header row indices of raw file. (default: [0])
327      -o, --output OUTPUT   Location to store the lyproxified CSV file. (default:
328                            None)
329      -m, --mapping MAPPING
330                            Location of the Python file that contains column mapping
331                            instructions. This must contain a dictionary with the name
332                            'column_map'. (default: None)
333      --drop-rows DROP_ROWS [DROP_ROWS ...]
334                            Delete rows of specified indices. Counting of rows start
335                            at 0 _after_ the `header-rows`. (default: [])
336      --drop-cols DROP_COLS [DROP_COLS ...]
337                            Delete columns of specified indices. (default: [])
338      --add-index           If the data doesn't contain an index, add one by
339                            enumerating the patients (default: False)
340    ```
341    """
342    raw: pd.DataFrame = load_patient_data(args.input)
343    raw = clean_header(raw, num_cols=raw.shape[1], num_header_rows=len(args.header_rows))
344
345    cols_to_drop = raw.columns[args.drop_cols]
346    trimmed = raw.drop(cols_to_drop, axis="columns")
347    trimmed = trimmed.drop(index=args.drop_rows)
348    trimmed = trimmed.dropna(axis="index", how="all")
349    logger.info(f"Dropped rows {args.drop_rows} and columns {cols_to_drop}.")
350
351    spec = importlib.util.spec_from_file_location("map_module", args.mapping)
352    mapping = importlib.util.module_from_spec(spec)
353    spec.loader.exec_module(mapping)
354    logger.info(f"Imported mapping instructions from {args.mapping}")
355
356    reduced = exclude_patients(trimmed, mapping.EXCLUDE)
357
358    if args.add_index:
359        reduced.insert(0, ("patient", "#", "id"), list(range(len(reduced))))
360        logger.info("Added index column to data.")
361
362    processed = transform_to_lyprox(reduced, mapping.COLUMN_MAP)
363
364    if ("tumor", "1", "side") in processed.columns:
365        processed = leftright_to_ipsicontra(processed)
366
367    save_table_to_csv(args.output, processed)

The main entry point for the CLI of this command. Upon requesting lyscripts data lyproxify --help, this is the help output:

USAGE: lyscripts data lyproxify [-h] -i INPUT [-r HEADER_ROWS [HEADER_ROWS ...]]
                                -o OUTPUT -m MAPPING
                                [--drop-rows DROP_ROWS [DROP_ROWS ...]]
                                [--drop-cols DROP_COLS [DROP_COLS ...]]
                                [--add-index]

Consumes raw data and transforms it into a CSV of the format that LyProX can
understand.

To do so, it needs a dictionary that defines a mapping from raw columns to the
LyProX style data format. See the documentation of the `transform_to_lyprox`
function for more information.

OPTIONAL ARGUMENTS:
  -h, --help            show this help message and exit
  -i, --input INPUT     Location of raw CSV data. (default: None)
  -r, --header-rows HEADER_ROWS [HEADER_ROWS ...]
                        List with header row indices of raw file. (default: [0])
  -o, --output OUTPUT   Location to store the lyproxified CSV file. (default:
                        None)
  -m, --mapping MAPPING
                        Location of the Python file that contains column mapping
                        instructions. This must contain a dictionary with the name
                        'column_map'. (default: None)
  --drop-rows DROP_ROWS [DROP_ROWS ...]
                        Delete rows of specified indices. Counting of rows start
                        at 0 _after_ the `header-rows`. (default: [])
  --drop-cols DROP_COLS [DROP_COLS ...]
                        Delete columns of specified indices. (default: [])
  --add-index           If the data doesn't contain an index, add one by
                        enumerating the patients (default: False)