import pandas as pd
import numpy as np
import pathlib as pl
import datetime as dt
from typing import Union, Optional, Tuple, List
from functools import reduce #needed to allow merging of multiple datasets
# saving the name of the data directory
data_folder = pl.Path(r'..\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23')

# saving the path to the timestamps file
time_stamps = r'..\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23\Timesheets (1).csv'

# times columns
time_columns = ['Participant ID', 'Start of Baseline', 'End of Baseline', 'Start of Task 1', 'End of Task 1', 'Start of Recovery Period', 'End of Recovery Period']

# saving path to output directory
output = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\output')
Finometer to Python example project.

This document is a working example of how I've written and applied this code to your project. I will have sent on the two data files.

I have converted the cell below to markdown so that it can't be re-run. The point of this cell is to clean up the case convention (upper and lowercase) used in the naming here.
# Loop through all files and folders in the tree containing the word 'participant'
for entry in data_folder.glob('**/*participant*'):
    # Check if the entry is a file or folder
    if entry.is_file() or entry.is_dir():
        # Get the current name of the file or folder
        current_name = entry.name
        # Convert the name to title case using the title() method
        new_name = current_name.replace('participant', 'Participant').title()
        # Rename the file or folder with the new name
        entry.rename(entry.with_name(new_name))
# Loop through all files in the tree containing the word 'participant'
for entry in data_folder.glob('**/*participant*'):
    # Check if the entry is a file
    if entry.is_file():
        # Get the current suffix (file extension) of the file
        current_suffix = entry.suffix
        # Convert the suffix to lowercase using the lower() method
        new_suffix = current_suffix.lower()
        # Rename the file with the new suffix
        new_entry = entry.with_suffix(new_suffix)
        entry.rename(new_entry)
# Cleaning up from previous tests
for i in data_folder.glob('**/*.csv'):
    i.unlink()
After this I need to work on finding the participants with two parts, then merging parts one and two together into a single file.
multipart = []

for i in data_folder.iterdir():
    if i.is_dir():
        # check if 'Part ' is in the stem of the folder
        if 'Part ' in i.stem:
            # save the folder name to a list
            folder_name = i.name
            multipart.append(folder_name)

[print(i) for i in multipart]
Participant 23 Part 2_2022-11-04_17.00.16
Participant 30 Part 2_2022-11-08_11.24.07
Participant 65 Part 2_2022-11-22_11.18.00
[None, None, None]
OK, so we've found 3 participants with multipart studies:

Participant 23, Participant 30, Participant 65

I need to clean up the naming of the first part of Participant 23, and then I need to work on combining parts 1 and 2 into one .txt file for all of them. I'm documenting this here so that it's accounted for. I'm going to do it in a code cell and then convert the cell to markdown to be safe.
# removing the words ' Part 1' from the folder and file stems
for i in data_folder.glob('**/*'):
    if ' Part 1' in i.stem:
        new_stem = i.stem.replace(' Part 1', '')
        new_name = i.parent.joinpath(new_stem + i.suffix)
        i.rename(new_name)
        print(new_name)
Now to do some text editing and resaving. What I need to do is:

1. Find the part 1 .txt file and the part 2 .txt file.
2. Read them both into memory.
3. Extract the data (which begins on line 9 of the .txt files) from the part 2 file.
4. Write those lines onto the end of the data in the part 1 file, without editing any of the earlier parts of the part 1 file.
5. Save this new version of the part 1 .txt file. Maybe save a note on one of the blank lines that this is a new file, but this can't change the header structure (no new lines added before line 9).

Also, for the meantime I should save these new .txt files in an output folder…
Let’s see what I can do.
# saving the path to both part 1 and part 2 of participant 23, plus a combined output path
part1_23 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23\Participant 23_2022-11-04_16.31.31\Participant 23_2022-11-04_16.31.31.txt')
part2_23 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23\Participant 23 Part 2_2022-11-04_17.00.16\Participant 23 Part 2_2022-11-04_17.00.16.txt')
combined_23 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\output\Participant 23_2022-11-04_16.31.31.txt')
def append_cardio_data(
    part1_file: Union[str, pl.Path],
    part2_file: Union[str, pl.Path],
    output_file: Optional[Union[str, pl.Path]] = None
) -> List[str]:
    '''
    Appends cardiovascular data from the part 2 file to the part 1 file and optionally writes the combined data to an output file.

    Parameters
    ----------
    part1_file : Union[str, Path]
        File path of the part 1 file.
    part2_file : Union[str, Path]
        File path of the part 2 file.
    output_file : Union[str, Path], optional
        File path of the output file to write the combined data (default is None).

    Returns
    -------
    List[str]
        List of combined cardiovascular data.
    '''
    # Step 1: Read in the part 1 file
    with open(part1_file, 'r') as f:
        lines_part1 = f.readlines()

    # Step 2: Save the header above line 9 in file 1
    header_part1 = lines_part1[:8]

    # Step 3: Save the data from line 9 down from file 1 (this keeps the column-header row)
    data_part1 = lines_part1[8:]

    # Step 4: Save the data from file 2, skipping its 8 header lines and its repeated column-header row (line 9)
    with open(part2_file, 'r') as f:
        lines_part2 = f.readlines()[9:]

    # Step 5: Append the data from file 2 to the bottom of the data from file 1
    combined_data = data_part1 + lines_part2

    if output_file is not None:
        # Step 6: Write the header and the combined data to a .txt file
        with open(output_file, 'w') as f:
            f.writelines(header_part1)
            f.writelines(combined_data)

    return combined_data

# Example usage
# append_cardio_data(part1_23, part2_23, combined_23)
So now when I run this I have a single file that combines both of the files for the participant (in this case Participant 23) into one .txt file. When you inspect the file you can see the time jump, partly because there's a skip in the 'Time (s)' column, but you can also see the recalibration period, as shown below:
Time (s);Systolic Pressure (mmHg);Diastolic Pressure (mmHg);Mean Pressure (mmHg);Heart rate (bpm);Stroke Volume (ml);Left Ventricular Ejection Time (ms);Pulse Interval (ms);...
16:57:17.135;201;168;186;117;15.8;300;515;742;1.8;6.090;8120;;
17:00:19.705;0;0;0;103;0.0;0;585;0;0.0;0.000;0;;
17:00:20.290;0;0;0;103;0.0;0;580;0;0.0;0.000;0;;
17:00:20.870;0;0;0;76;0.0;0;785;0;0.0;0.000;0;;
17:00:21.655;0;0;0;31;0.0;0;1910;0;0.0;0.000;0;;
17:00:23.565;0;0;0;53;0.0;0;1130;0;0.0;0.000;0;;
17:00:24.695;0;0;0;100;0.0;0;600;0;0.0;0.000;0;;
17:00:25.295;0;0;0;98;0.0;0;615;0;0.0;0.000;0;;
17:00:25.910;0;0;0;99;0.0;0;605;0;0.0;0.000;0;;
17:00:26.515;0;0;0;101;0.0;0;595;0;0.0;0.000;0;;
17:00:27.110;0;0;0;102;0.0;0;590;0;0.0;0.000;0;;
17:00:27.700;0;0;0;98;0.0;0;615;0;0.0;0.000;0;;
17:00:28.315;0;0;0;93;0.0;0;645;0;0.0;0.000;0;;
17:00:28.960;0;0;0;90;0.0;0;670;0;0.0;0.000;0;;
17:00:29.630;0;0;0;97;0.0;0;620;0;0.0;0.000;0;;
17:00:30.250;0;0;0;91;0.0;0;660;0;0.0;0.000;0;;
17:00:30.910;0;0;0;94;0.0;0;640;0;0.0;0.000;0;;
17:00:31.550;0;0;0;92;0.0;0;650;0;0.0;0.000;0;;
17:00:32.200;0;0;0;90;0.0;0;670;0;0.0;0.000;0;;
17:00:32.870;0;0;0;86;0.0;0;700;0;0.0;0.000;0;;
17:00:33.570;0;0;0;88;0.0;0;685;0;0.0;0.000;0;;
17:00:34.255;0;0;0;86;0.0;0;695;0;0.0;0.000;0;;
17:00:34.950;0;0;0;83;0.0;0;720;0;0.0;0.000;0;;
17:00:35.670;0;0;0;80;0.0;0;750;0;0.0;0.000;0;;
17:00:36.420;0;0;0;77;0.0;0;775;0;0.0;0.000;0;;
17:00:37.194;176;112;136;90;46.0;295;670;1675;4.1;1.977;2636;;
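One thing worth noting: those zero-filled rows make the recalibration period easy to flag downstream if you ever want to exclude it from an analysis. A minimal sketch (assuming a dataframe `df` with the columns shown above, e.g. as returned by the import function defined later in this document):

# Sketch (not part of the pipeline): the Finometer writes zeros to the pressure
# columns while it recalibrates, so those rows can be masked out before averaging.
recalibrating = df['Systolic Pressure (mmHg)'] == 0
df_no_recal = df[~recalibrating]  # beat data with the zero-filled rows dropped
print(f"{recalibrating.sum()} recalibration rows flagged")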
Now that we know we have a function that will work, we can run it on the 3 problem participants and overwrite each part 1 .txt file with the full data. This is a big part of why I wanted to document this process in a notebook, so everyone can see exactly what's been done. I can very easily call it on the Participant 23 files because of the work above.
# commented out to avoid overwriting the file
# append_cardio_data(part1_23, part2_23, part1_23)
So now Participant 23 has one file, in the folder with the correct name.

Let's do the same for Participants 30 and 65.
# saving the path to both part 1 and part 2 of participant 30
p30_01 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23\Participant 30_2022-11-08_10.40.42\Participant 30_2022-11-08_10.40.42.txt')
p30_02 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23\Participant 30 Part 2_2022-11-08_11.24.07\Participant 30 Part 2_2022-11-08_11.24.07.txt')

# saving the path to both part 1 and part 2 of participant 65
p65_01 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23\Participant 65_2022-11-22_10.40.43\Participant 65_2022-11-22_10.40.43.txt')
p65_02 = pl.Path(r'C:\Users\kevin.omalley\OneDrive - University of Limerick\Documents\GitHub\fino2py\all_data\Finometer Data Semester 1 and 2 14.04.23\Participant 65 Part 2_2022-11-22_11.18.00\Participant 65 Part 2_2022-11-22_11.18.00.txt')
# creating the combined file for participant 30
# append_cardio_data(p30_01, p30_02, p30_01)
# commented to avoid repeating it now that the original part 1 file has been overwritten
# creating the combined file for participant 65
# append_cardio_data(p65_01, p65_02, p65_01)
# commented to avoid repeating it now that the original part 1 file has been overwritten
The 3 participants with multiple parts have each had their data written together into a single .txt file. The original part 1 file has been overwritten to contain all the data, and now I'm going to manually delete the part 2 folders for each person. This is why a 'raw' data corpus has been saved separately as a zip archive, so we can always go back and look at that data if needed.
Functions
From here down are the function definitions. I'm not sure yet how to split this up, as the functions don't follow on in order, but I'll see how that feels from here on.
Times
The functions immediately below relate to handling the various time objects. There are two types of times:

1. The Time (s) values generated by the Finometer as it captures data at each heart beat. These are in the format '12:13:02.154' ('%H:%M:%S.%f').
2. The 'timestamps' generated by the researchers as they mark the start and end of each testing stage in their protocol. These are either in the format '12:13:02' ('%H:%M:%S', which is preferable) or '12:13' ('%H:%M', which would indicate a minor issue in timekeeping).

These require different functions, and they need to be defined before the importing and reshaping takes place.
def convert_fino_time(fino_time: str) -> dt.time:
    """
    Converts the string times produced by the Finometer to datetime.time objects, dropping the microseconds.

    Parameters:
    ----------
    fino_time : str
        The Finometer time in the format '%H:%M:%S.%f'

    Returns:
    -------
    datetime.time
        The time as a `datetime.time` object.
    """
    try:
        time = dt.datetime.strptime(fino_time, '%H:%M:%S.%f').time()
        time_str_no_ms = time.strftime('%H:%M:%S')
        time_obj_no_ms = dt.datetime.strptime(time_str_no_ms, '%H:%M:%S').time()
    except Exception as e:
        raise ValueError(f"Failed to convert time {fino_time} to datetime object. Error: {e}")
    return time_obj_no_ms
def convert_timestamp_time(timestamp_time: str) -> dt.time:
    """
    Convert a timestamp time in the format '09:02:12' to a datetime.time object without microseconds.

    Parameters:
    ----------
    timestamp_time : str
        The timestamp time in the format '09:02:12'

    Returns:
    -------
    datetime.time
        The time as a `datetime.time` object.
    """
    try:
        time = dt.datetime.strptime(timestamp_time, '%H:%M:%S').time()
    except Exception as e:
        raise ValueError(f"Failed to convert time {timestamp_time} to datetime object. Error: {e}")
    return time
def convert_partial_time(partial_time: str) -> dt.time:
    """
    Convert a partial time in the format '09:02' to a datetime.time object with 0 seconds.

    Parameters:
    ----------
    partial_time : str
        The partial time in the format '09:02'

    Returns:
    -------
    datetime.time
        The time as a `datetime.time` object with 0 seconds.
    """
    try:
        time = dt.datetime.strptime(partial_time, '%H:%M').time()
        time_str = time.strftime('%H:%M:%S')
        time_obj = dt.datetime.strptime(time_str, '%H:%M:%S').time()
    except Exception as e:
        raise ValueError(f"Failed to convert time {partial_time} to datetime object. Error: {e}")
    return time_obj
print(convert_partial_time('09:02'), type(convert_partial_time('09:02')))
09:02:00 <class 'datetime.time'>
print(convert_timestamp_time('09:02:12'), type(convert_timestamp_time('09:02:12')))
09:02:12 <class 'datetime.time'>
print(convert_fino_time('09:02:12.142'),type(convert_fino_time('09:02:12.142')))
09:02:12 <class 'datetime.time'>
These functions return the kind of object we need to do the work, and they convert all the required times into the same '%H:%M:%S' format. So the next step is to define the functions that import the data from the Finometer.

Importing
def read_raw_finometer_data(folder_path: Union[str, pl.Path], interval: Optional[str] = None, save_csv: bool = False) -> Tuple[pd.DataFrame, str]:
    '''This function imports the raw finometer data and, if an interval is given, calculates the average of each measure over that interval.
    By default no resampling is done; set the interval parameter to resample.
    This function may not be needed in many cases, but it is useful to have, and a good place to start.

    Parameters
    ----------
    folder_path : pathlib.Path object or str
        The path to the folder containing the .txt file
    interval : str, optional
        If provided, the function will resample the data to the given interval and return the resampled data.
    save_csv : bool, optional
        If True, the function will save the imported data as a .csv file in the same folder as the .txt file.
        The default is False.

    Raises
    ------
    TypeError:
        If folder_path is not a pathlib.Path object or a string
    ValueError:
        If folder_path does not exist or is not a directory
        If there is not exactly one .txt file in the folder

    Returns
    -------
    pandas.DataFrame:
        Dataframe with the raw finometer data, resampled to the given interval if one was provided
    ID : str
        The Participant ID of the participant whose data is being imported
    '''
    try:
        folder_path = pl.Path(folder_path)
    except TypeError:
        raise TypeError('folder_path must be a pathlib.Path object or a string')

    if not folder_path.exists():
        raise ValueError('folder_path does not exist')

    if folder_path.is_dir():
        files = [file for file in folder_path.glob('*.txt')]
        if len(files) != 1:
            raise ValueError(f'Expected one .txt file, but found {len(files)} in the folder {folder_path.name}')
        file = files[0]
    elif folder_path.is_file():
        file = folder_path

    ID = file.stem.split('_')[0]

    df = pd.read_csv(
        file,
        sep=';',
        header=0,
        skiprows=8,
        skipfooter=1,
        engine='python'
    )

    df = df[['Time (s)', 'Systolic Pressure (mmHg)', 'Diastolic Pressure (mmHg)',
             'Mean Pressure (mmHg)', 'Heart rate (bpm)', 'Stroke Volume (ml)',
             'Left Ventricular Ejection Time (ms)', 'Pulse Interval (ms)',
             'Maximum Slope (mmHg/s)', 'Cardiac Output (l/min)',
             'Total Peripheral Resistance Medical Unit (mmHg.min/l)',
             'Total Peripheral Resistance CGS (dyn.s/cm5)', 'Markers']]

    df.index = pd.to_datetime(df['Time (s)'], format='%H:%M:%S.%f').dt.floor('ms')
    df = df.drop('Time (s)', axis=1)

    if interval:  # if the user specifies an interval, resample the data to that interval
        df = df.resample(f'{interval}').mean()
        df.index = df.index.strftime('%H:%M:%S.%f').str[:-3]
    else:
        df.index = df.index.strftime('%H:%M:%S.%f').str[:-3]

    csv_path = folder_path / file.with_stem(f'imported data for {ID}').with_suffix('.csv')
    if save_csv:
        df.to_csv(csv_path, index=True)

    return df, ID

a, a_id = read_raw_finometer_data([i for i in data_folder.iterdir() if i.is_dir()][0], save_csv=True)
a.columns
Index(['Systolic Pressure (mmHg)', 'Diastolic Pressure (mmHg)',
'Mean Pressure (mmHg)', 'Heart rate (bpm)', 'Stroke Volume (ml)',
'Left Ventricular Ejection Time (ms)', 'Pulse Interval (ms)',
'Maximum Slope (mmHg/s)', 'Cardiac Output (l/min)',
'Total Peripheral Resistance Medical Unit (mmHg.min/l)',
'Total Peripheral Resistance CGS (dyn.s/cm5)', 'Markers'],
dtype='object')
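That call used the raw beat-to-beat data, so the `interval` parameter wasn't exercised. For reference, a hedged usage sketch of the resampling path (the variable names here are just for illustration):

# Sketch: the same import, but resampled to 1-minute means via the interval parameter.
# '1T' (or 'min' in newer pandas) is a one-minute offset alias; '30S' would give 30-second bins.
first_folder = [i for i in data_folder.iterdir() if i.is_dir()][0]
a_resampled, a_resampled_id = read_raw_finometer_data(first_folder, interval='1T')
a_resampled.head()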
def import_demographics(folder_path: str) -> pd.DataFrame:
    """
    Reads in the demographics from the .txt file and returns a DataFrame row containing the data.

    Parameters:
        folder_path (str): Path to the folder containing the demographics file.

    Returns:
        demographics_df (pd.DataFrame): DataFrame containing the demographics data.
    """
    try:
        folder_path = pl.Path(folder_path)
    except TypeError:
        raise TypeError('folder_path must be a pathlib.Path object or a string')

    if not folder_path.exists():
        raise ValueError('folder_path does not exist')

    if folder_path.is_dir():
        files = [file for file in folder_path.glob('*.txt')]
        if len(files) != 1:
            raise ValueError(f'Expected one .txt file, but found {len(files)} in the folder')
        file = files[0]
    elif folder_path.is_file():
        file = folder_path

    ID = file.stem.split('_')[0]

    # Read in the demographics data from the file
    df = pd.read_csv(
        file,
        sep=';',
        header=0,
        skiprows=2,
        nrows=1,
        engine='python'
    )

    # Select the relevant columns from the DataFrame
    demographics_df = df.loc[:, ['Identification', 'Age (yrs)', 'Height (cm)', 'Weight (kg)', 'Gender']]

    # Rename the columns
    demographics_df.columns = ['Participant ID', 'Age (years)', 'Height (cm)', 'Weight (kg)', 'Gender']

    return demographics_df
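This function isn't called anywhere below, but for reference a usage sketch would look like this (same one-folder-per-participant convention as above):

# Sketch: read the demographics row for the first participant folder.
demo = import_demographics([i for i in data_folder.iterdir() if i.is_dir()][0])
demo.head()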
def create_chunk(df, ID, tag, start, end):
    """
    Create a chunk of data from a dataframe between specified start and end times and return a new dataframe
    containing the mean values for each column in the chunk.

    Parameters:
    -----------
    df : pandas DataFrame
        The dataframe containing the data to extract a chunk from.
    ID : str
        The participant ID to include in the output dataframe.
    tag : str
        The tag to include in the column names of the output dataframe.
    start : str or None
        The start time of the chunk in the format 'HH:MM:SS' or 'HH:MM:SS.mmm'. If None, the chunk starts at the
        beginning of the dataframe.
    end : str or None
        The end time of the chunk in the format 'HH:MM:SS' or 'HH:MM:SS.mmm'. If None, the chunk ends at the
        end of the dataframe.

    Returns:
    --------
    pandas DataFrame
        A new dataframe containing the mean values for each column in the specified chunk of the input dataframe.
        The output dataframe has a row for the specified participant ID and columns with names that include the
        specified tag.
    """
    # Convert the index to datetime (although this still feels like a kludge and I don't like it)
    df.index = pd.to_datetime(df.index, format='%H:%M:%S.%f')

    # Extract the chunk of data and compute the mean values for each column.
    # Note: between_time requires both bounds, so an open bound is filled with the day's limits.
    try:
        if start and end:
            chunk = df.between_time(start_time=start, end_time=end).mean().to_frame().T
        elif start:
            chunk = df.between_time(start_time=start, end_time='23:59:59.999999').mean().to_frame().T
        elif end:
            chunk = df.between_time(start_time='00:00:00', end_time=end).mean().to_frame().T
        else:
            chunk = df.mean().to_frame().T
    except Exception as e:
        raise ValueError(f"Failed to extract chunk between {start} and {end}. Error: {e}")

    # Rename the columns with the specified tag and insert the participant ID as the first column
    chunk.columns = [f"{tag} {i}" for i in chunk.columns]
    chunk.insert(0, 'Participant ID', ID)

    return chunk
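For illustration, here's a hedged example of calling `create_chunk` directly on the dataframe imported above; the start and end times here are made up, and real ones come from the timestamps file:

# Sketch: average everything between two (hypothetical) clock times and label
# it as the 'baseline' chunk for the participant imported earlier.
baseline_chunk = create_chunk(a, a_id, 'baseline', '09:17:20', '09:27:20')
baseline_chunk.head()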
def import_protocol_times(times_file_path: Union[pl.Path, str], cols_to_keep: list, save_csv: bool = False) -> pd.DataFrame:
    '''
    This function imports the protocol times from a .csv file and returns a cleaned pandas dataframe with the protocol times for each participant.

    Parameters
    ----------
    times_file_path : pathlib.Path or str
        The path to the .csv file containing the protocol times.
    cols_to_keep : list
        The columns to keep from the .csv file (the Participant ID column first, followed by the timestamp columns).
    save_csv : bool, optional
        If True, the imported data will be saved as a .csv file in the same folder as the input file.

    Raises
    ------
    TypeError:
        If times_file_path is not a pathlib.Path object.
    ValueError:
        If times_file_path does not exist or is not a file.
        If times_file_path does not have a .csv extension.

    Returns
    -------
    pandas.DataFrame
        A cleaned pandas dataframe with the protocol times for each participant.
    '''
    assert isinstance(times_file_path, (str, pl.Path)), 'file_path must be a pathlib.Path object or a string that can be converted to one.'
    if isinstance(times_file_path, str):
        times_file_path = pl.Path(times_file_path)

    assert times_file_path.exists(), 'file_path does not exist'
    assert times_file_path.is_file(), 'file_path is not a file'
    assert times_file_path.suffix == '.csv', 'file_path is not a csv file, please save times file as a .csv file'

    def convert_time(time_str):
        '''This function converts a time string to a datetime object if possible'''
        try:
            time_str = time_str.strip('"')
            if len(time_str) == 5:
                return convert_partial_time(time_str)
            elif len(time_str) == 8:
                return convert_timestamp_time(time_str)
            else:
                return np.nan
        except Exception:
            return np.nan

    df = pd.read_csv(times_file_path)
    df.columns = [col.strip() for col in df.columns]

    if cols_to_keep:
        df = df[cols_to_keep]

    for col in df.columns[1:]:
        df[col] = df[col].apply(convert_time)

    if save_csv:  # if you want to save the csv file (which may be useful if you want to use the data in other ways)
        try:
            df.to_csv(times_file_path.parent / "cleaned times.csv", index=False)
            print(f"CSV saved for {times_file_path.stem}")
        except Exception as e:
            print(f"Could not save csv file, error: {e}")

    return df
y = import_protocol_times(time_stamps, time_columns)
y.head()
|   | Participant ID | Start of Baseline | End of Baseline | Start of Task 1 | End of Task 1 | Start of Recovery Period | End of Recovery Period |
|---|----------------|-------------------|-----------------|-----------------|---------------|--------------------------|------------------------|
| 0 | Participant 1  | 09:17:20 | 09:27:20 | 09:28:11 | 09:35:10 | 09:35:15 | 09:43:15 |
| 1 | Participant 2  | 10:51:00 | 11:05:05 | 11:06:00 | 11:11:10 | 11:11:15 | 11:19:20 |
| 2 | Participant 3  | 12:20:36 | 12:30:36 | 12:34:22 | 12:38:57 | 12:38:59 | 12:46:04 |
| 3 | Participant 4  | 13:55:10 | 14:05:10 | 14:07:28 | 14:11:59 | 14:12:05 | 14:20:05 |
| 4 | Participant 5  | 15:13:00 | 15:23:00 | 15:25:00 | 15:30:00 | 15:30:00 | 15:38:00 |
y[y.isna().any(axis=1)]
|     | Participant ID  | Start of Baseline | End of Baseline | Start of Task 1 | End of Task 1 | Start of Recovery Period | End of Recovery Period |
|-----|-----------------|-------------------|-----------------|-----------------|---------------|--------------------------|------------------------|
| 53  | Participant 54  | NaN | NaN | NaN | NaN | NaN | NaN |
| 63  | Participant 64  | 09:20:00 | 09:30:00 | NaN | NaN | NaN | NaN |
| 124 | Participant 544 | NaN | NaN | NaN | NaN | NaN | NaN |
| 125 | Participant 545 | NaN | NaN | NaN | NaN | NaN | NaN |
| 126 | Participant 546 | NaN | NaN | NaN | NaN | NaN | NaN |
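These rows will simply fail (and be reported) in the import loop further down, but if you wanted to screen them out up front, something like this sketch would do it:

# Sketch: drop participants with no usable protocol times before the main loop.
# The loop below already catches these via try/except, so this is optional.
y_complete = y.dropna(subset=y.columns[1:], how='all')  # rows with no times at all
print(f"{len(y) - len(y_complete)} participants have no usable timestamps")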
# testing version of the function
def import_protocol_averages(frame, id, times=None, save_csv=None):
    '''A function that imports the averaged finometer files (which have already been processed from the raw data)
    to produce averages for each section of the experimental protocol.

    Parameters
    ----------
    frame : pandas.DataFrame
        The DataFrame containing the averaged finometer data
    id : str
        The participant ID
    times : dict, optional
        A dictionary of tuples of times, with the keys being the names of the time periods.
    save_csv : bool, optional
        If True, the imported data will be saved as a .csv file in the same folder as the .csv file,
        this is not always needed and should be used sparingly

    Returns
    -------
    pandas.DataFrame
        A DataFrame with the mean values of the given columns during each time period of the study.

    Raises
    ------
    TypeError
        If frame is not a pandas.DataFrame object
        If id is not a string
    ValueError
        If times is not provided as a dictionary with at least one key-value pair
        If there are not enough times provided for a given time period
        If there are too many times provided for a given time period
    '''
    # check if frame is a pandas.DataFrame object
    if not isinstance(frame, pd.DataFrame):
        raise TypeError('''
        frame must be a pandas.DataFrame object, produced by the read_raw_finometer_data function,
        have you run the read_raw_finometer_data function on the data?''')

    if not isinstance(id, str):
        raise TypeError('id must be a string')

    if not times:
        raise ValueError("times must be a dictionary and at least one key-value pair must be provided.")

    # Create an empty list of dataframes, each representing a chunk of the protocol
    chunks = []

    for i in times.keys():
        if len(times[i]) < 2:
            raise ValueError(f"There are not enough times provided for the {i}.")
        elif len(times[i]) > 2:
            raise ValueError(f"There are too many times provided for the {i}.")
        elif len(times[i]) == 2:
            if times[i][0] < times[i][1]:
                chunks.append(create_chunk(frame, id, i, times[i][0], times[i][1]))

    data_merge = reduce(lambda left, right: pd.merge(left, right, on=["Participant ID"], how="outer"), chunks)
    data_merge.set_index('Participant ID', inplace=True)

    if save_csv:
        path = pl.Path(save_csv)
        data_merge.to_csv(path / f"{id} protocol_averages.csv")
        print(f"Saved {id} protocol averages.csv to {path.stem}")

    return data_merge
# this cell runs the functions on all the different files and writes them to a single dataframe
import warnings
warnings.filterwarnings('ignore')

dfs = []

for row in y.iterrows():
    id = row[1][0]
    times = {'baseline': [row[1][1], row[1][2]], 'task': [row[1][3], row[1][4]], 'recovery': [row[1][5], row[1][6]]}

    for folder in data_folder.glob('**'):
        if id == folder.stem.split('_')[0]:
            df, df_id = read_raw_finometer_data(folder)
            try:
                dfs.append(import_protocol_averages(df, df_id, times))
            except:
                print(f"Could not import protocol averages for {id}")

warnings.filterwarnings('default')

result_df = pd.concat(dfs, axis=0)

# Then I save them to an excel file
# result_df.to_excel(output/'Ailbhe data time one and two.xlsx')
# and view the first 20 rows
result_df.head(20)
| Participant ID | baseline Systolic Pressure (mmHg) | baseline Diastolic Pressure (mmHg) | baseline Mean Pressure (mmHg) | baseline Heart rate (bpm) | baseline Stroke Volume (ml) | baseline Left Ventricular Ejection Time (ms) | baseline Pulse Interval (ms) | baseline Maximum Slope (mmHg/s) | baseline Cardiac Output (l/min) | baseline Total Peripheral Resistance Medical Unit (mmHg.min/l) | ... | recovery Mean Pressure (mmHg) | recovery Heart rate (bpm) | recovery Stroke Volume (ml) | recovery Left Ventricular Ejection Time (ms) | recovery Pulse Interval (ms) | recovery Maximum Slope (mmHg/s) | recovery Cardiac Output (l/min) | recovery Total Peripheral Resistance Medical Unit (mmHg.min/l) | recovery Total Peripheral Resistance CGS (dyn.s/cm5) | recovery Markers |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Participant 1 | 201.177229 | 158.281418 | 175.578947 | 93.785177 | 16.931901 | 241.997852 | 644.624060 | 1120.077336 | 1.583673 | 6.766189 | ... | 184.357039 | 86.988389 | 15.967634 | 247.358491 | 696.262700 | 1243.576197 | 1.388824 | 8.275520 | 11034.023222 | NaN |
2 | 174.749768 | 119.989805 | 142.184430 | 78.142725 | 28.795366 | 219.295644 | 778.234476 | 1413.936979 | 2.239296 | 3.875071 | ... | 146.425325 | 76.980519 | 29.906818 | 288.733766 | 787.873377 | 1734.563312 | 2.287013 | 3.879511 | 5172.691558 | NaN |
Eve | 129.075871 | 83.501244 | 99.720149 | 84.395522 | 54.166667 | 266.194030 | 746.262438 | 1421.238806 | 4.529975 | 1.500659 | ... | 102.937500 | 81.082721 | 61.361949 | 276.966912 | 781.360294 | 1611.580882 | 4.899081 | 1.974364 | 2632.485294 | NaN |
4 | 138.128621 | 92.714948 | 109.310545 | 86.833140 | 40.428158 | 267.937428 | 695.185400 | 1201.106605 | 3.502549 | 1.892425 | ... | 119.121508 | 90.459497 | 35.469972 | 264.385475 | 670.181564 | 1402.484637 | 3.204050 | 2.278200 | 3037.578212 | NaN |
5 | 93.990654 | 80.220561 | 85.323364 | 64.459813 | 18.105794 | 289.803738 | 1026.046729 | 239.471028 | 1.179252 | 6.109763 | ... | 88.366730 | 67.362949 | 34.604537 | 293.667297 | 907.759924 | 397.132325 | 2.319660 | 2.364981 | 3153.315690 | NaN |
Participant 6 | 106.450131 | 79.707349 | 85.530184 | 73.707349 | 31.032021 | 294.967192 | 819.061680 | 623.863517 | 2.281365 | 2.324482 | ... | 83.608014 | 73.200348 | 33.649826 | 286.829268 | 838.736934 | 874.921603 | 2.455749 | 2.189387 | 2919.163763 | NaN |
8 | 116.379518 | 62.829819 | 83.042922 | 98.686747 | 92.937801 | 267.074548 | 611.573795 | 1552.516566 | 9.143599 | 0.554955 | ... | 91.721193 | 98.309671 | 89.174794 | 264.660494 | 617.217078 | 1940.726337 | 8.755041 | 0.652887 | 870.530864 | NaN |
Participant 8 Correct | 201.799232 | 144.032661 | 161.606148 | 109.024976 | 55.949952 | 285.806916 | 570.100865 | 1240.792507 | 6.113641 | 1.783168 | ... | 170.406286 | 102.499418 | 47.423050 | 278.981374 | 607.409779 | 1168.190920 | 4.862049 | 2.428739 | 3238.316647 | NaN |
9 | 125.233402 | 73.839212 | 91.713693 | 96.490664 | 48.927282 | 251.312241 | 622.702282 | 1395.449170 | 4.715975 | 1.170438 | ... | 101.525974 | 96.567532 | 37.484805 | 259.129870 | 623.337662 | 1236.240260 | 3.616364 | 1.707231 | 2276.292208 | NaN |
Participant 10 | 105.308287 | 71.392265 | 83.322652 | 73.269613 | 76.519337 | 296.044199 | 822.773481 | 620.046409 | 5.592928 | 0.900290 | ... | 88.494526 | 69.306569 | 83.577007 | 301.925182 | 876.195255 | 723.797445 | 5.779927 | 0.938234 | 1250.994526 | NaN |
Participant 11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Participant 11 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
Participant 11 | 130.189336 | 78.165397 | 99.374320 | 91.220892 | 45.370511 | 255.816104 | 663.803047 | 1458.297062 | 4.125027 | 1.467398 | ... | 91.247253 | 94.151099 | 45.262637 | 254.162088 | 642.431319 | 1407.892857 | 4.251786 | 1.304566 | 1739.418956 | NaN |
Participant 12 | 113.641154 | 79.105395 | 93.521957 | 79.949812 | 49.871016 | 275.690088 | 753.262233 | 830.787955 | 3.966123 | 1.421580 | ... | 101.213292 | 81.173107 | 61.067697 | 277.349304 | 742.272025 | 1097.418856 | 4.938949 | 1.237804 | 1650.420402 | NaN |
Participant 13 | 153.799539 | 94.494240 | 120.074885 | 73.370968 | 29.923618 | 284.487327 | 822.304147 | 1506.566820 | 2.184793 | 3.328818 | ... | 118.593614 | 65.931785 | 34.761974 | 299.825835 | 920.703919 | 1748.258345 | 2.276778 | 3.164370 | 4219.204644 | NaN |
Participant14 | 110.663357 | 65.892751 | 85.097319 | 101.382324 | 55.351936 | 249.021847 | 596.583913 | 1170.305859 | 5.583913 | 0.918202 | ... | 95.851064 | 101.436796 | 63.840926 | 248.873592 | 600.963705 | 1666.147685 | 6.453191 | 0.902253 | 1203.013767 | NaN |
15 | 146.770254 | 91.220073 | 110.955260 | 83.553809 | 94.748126 | 284.510278 | 725.622733 | 1377.804111 | 7.892624 | 0.851979 | ... | 121.593900 | 78.871589 | 93.343660 | 292.271268 | 770.882825 | 1666.252006 | 7.341413 | 1.003722 | 1338.282504 | NaN |
16 | 127.301105 | 89.838398 | 101.250000 | 72.772099 | 61.178315 | 288.038674 | 828.674033 | 801.943370 | 4.440884 | 1.374302 | ... | 103.336283 | 71.086726 | 54.882655 | 283.743363 | 849.628319 | 714.893805 | 3.890442 | 1.603179 | 2137.545133 | NaN |
Participant 17 | 134.065134 | 84.581098 | 102.873563 | 78.996169 | 62.525032 | 302.247765 | 765.593870 | 1037.118774 | 4.920690 | 1.268326 | ... | 104.625413 | 78.064356 | 62.809571 | 308.778878 | 776.245875 | 1111.288779 | 4.877723 | 1.303726 | 1738.303630 | NaN |
Participant 18 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 82.908333 | 69.183333 | 69.906667 | 283.000000 | 978.833333 | 2980.658333 | 4.875000 | 1.508275 | 2011.033333 | NaN |
20 rows × 36 columns
As we can see from the rows above, the files are being read in, but because the naming of the folders and files hasn't been fixed, the rows are very messy. For example, there are 3 rows for 'Participant 11' in the 'Participant ID' column. That's because there are 3 different folders named 'Participant 11' followed by date and time information (maybe there were multiple starts and stops with this person?).
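(A quick sketch, if you want to surface those duplicates programmatically rather than by eye:)

# Sketch: list any Participant IDs that appear more than once in the index.
dupes = result_df.index.value_counts()
print(dupes[dupes > 1])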
If we look at the folder called 'all_data\Finometer Data Semester 1 and 2 14.04.23\Participant 3_2022-10-27_12.16.15' we can see that the files are called 'Participant 3_2022-10-27_12.16.15\Eve_2022-10-27_12.16.15....', so the filenames don't match the folder names, and the IDs within the .txt files match the file names rather than the folders.

I need to fix this for this project (and make sure it doesn't happen in future projects).

So let's start by deleting folders that don't belong any more; this will include the duplicates and the 'Part 2' folders that we mentioned earlier.
(This is being done manually so there’s no code cell for it…)
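(For the record, the 'Part 2' half of that cleanup could be scripted roughly as below. A sketch only, deliberately not run, since shutil.rmtree deletes a folder tree permanently, which is exactly why I did it by hand against the backed-up raw corpus.)

# Sketch only: remove the now-redundant 'Part 2' folders after merging.
import shutil

for folder in data_folder.iterdir():
    if folder.is_dir() and 'Part 2' in folder.name:
        shutil.rmtree(folder)  # permanent deletion, hence not executed here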
Then I need to iterate over every folder, then over the files in each folder, and if the file stems don't match the folder name I need to replace them. This is worrying though, because if we examine the Participant 7 folder, the .txt file is called '_2022-10-28_13.40.16\8_2022-10-28_13.40.16.txt'.

I'm just going to trust that the folder titles are correct and go with that.
# converting this cell to Markdown so it doesn't run again when knitting
# renaming the file stems to match the folder names
# iterate over the top level of the data folder
for i in data_folder.glob('*'):
    # check to see if the entry is a directory
    if i.is_dir():
        # iterate over the files in the directory
        for j in i.glob('*'):
            # check if the file stem is the same as the folder stem
            if j.stem == i.stem:
                pass
            # if not, rename the file stem to match the folder stem, keeping the original suffix
            else:
                j.rename(j.parent / f"{i.stem}{j.suffix}")
Now that that's done, we can rerun the cell that built the results dataframe. Rather than going back up and rerunning the earlier cell, I'm going to paste in a new cell to do this, and then save a second version of the Excel file, this time (hopefully) with the correct participant names in the ID column.
# this cell runs the functions on all the different files and writes them to a single dataframe
import warnings
warnings.filterwarnings('ignore')

dfs = []

for row in y.iterrows():
    id = row[1][0]
    times = {'baseline': [row[1][1], row[1][2]], 'task': [row[1][3], row[1][4]], 'recovery': [row[1][5], row[1][6]]}

    for folder in data_folder.glob('**'):
        if id == folder.stem.split('_')[0]:
            df, df_id = read_raw_finometer_data(folder)
            try:
                dfs.append(import_protocol_averages(df, df_id, times))
            except:
                print(f"Could not import protocol averages for {id}")

warnings.filterwarnings('default')

new_result_df = pd.concat(dfs, axis=0)

# viewing the first 20 rows of the new dataframe
new_result_df.head(20)
| Participant ID | baseline Systolic Pressure (mmHg) | baseline Diastolic Pressure (mmHg) | baseline Mean Pressure (mmHg) | baseline Heart rate (bpm) | baseline Stroke Volume (ml) | baseline Left Ventricular Ejection Time (ms) | baseline Pulse Interval (ms) | baseline Maximum Slope (mmHg/s) | baseline Cardiac Output (l/min) | baseline Total Peripheral Resistance Medical Unit (mmHg.min/l) | ... | recovery Mean Pressure (mmHg) | recovery Heart rate (bpm) | recovery Stroke Volume (ml) | recovery Left Ventricular Ejection Time (ms) | recovery Pulse Interval (ms) | recovery Maximum Slope (mmHg/s) | recovery Cardiac Output (l/min) | recovery Total Peripheral Resistance Medical Unit (mmHg.min/l) | recovery Total Peripheral Resistance CGS (dyn.s/cm5) | recovery Markers |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Participant 1 | 201.177229 | 158.281418 | 175.578947 | 93.785177 | 16.931901 | 241.997852 | 644.624060 | 1120.077336 | 1.583673 | 6.766189 | ... | 184.357039 | 86.988389 | 15.967634 | 247.358491 | 696.262700 | 1243.576197 | 1.388824 | 8.275520 | 11034.023222 | NaN |
Participant 2 | 174.749768 | 119.989805 | 142.184430 | 78.142725 | 28.795366 | 219.295644 | 778.234476 | 1413.936979 | 2.239296 | 3.875071 | ... | 146.425325 | 76.980519 | 29.906818 | 288.733766 | 787.873377 | 1734.563312 | 2.287013 | 3.879511 | 5172.691558 | NaN |
Participant 3 | 129.075871 | 83.501244 | 99.720149 | 84.395522 | 54.166667 | 266.194030 | 746.262438 | 1421.238806 | 4.529975 | 1.500659 | ... | 102.937500 | 81.082721 | 61.361949 | 276.966912 | 781.360294 | 1611.580882 | 4.899081 | 1.974364 | 2632.485294 | NaN |
Participant 4 | 138.128621 | 92.714948 | 109.310545 | 86.833140 | 40.428158 | 267.937428 | 695.185400 | 1201.106605 | 3.502549 | 1.892425 | ... | 119.121508 | 90.459497 | 35.469972 | 264.385475 | 670.181564 | 1402.484637 | 3.204050 | 2.278200 | 3037.578212 | NaN |
Participant 5 | 93.990654 | 80.220561 | 85.323364 | 64.459813 | 18.105794 | 289.803738 | 1026.046729 | 239.471028 | 1.179252 | 6.109763 | ... | 88.366730 | 67.362949 | 34.604537 | 293.667297 | 907.759924 | 397.132325 | 2.319660 | 2.364981 | 3153.315690 | NaN |
Participant 6 | 106.450131 | 79.707349 | 85.530184 | 73.707349 | 31.032021 | 294.967192 | 819.061680 | 623.863517 | 2.281365 | 2.324482 | ... | 83.608014 | 73.200348 | 33.649826 | 286.829268 | 838.736934 | 874.921603 | 2.455749 | 2.189387 | 2919.163763 | NaN |
Participant 7 | 116.379518 | 62.829819 | 83.042922 | 98.686747 | 92.937801 | 267.074548 | 611.573795 | 1552.516566 | 9.143599 | 0.554955 | ... | 91.721193 | 98.309671 | 89.174794 | 264.660494 | 617.217078 | 1940.726337 | 8.755041 | 0.652887 | 870.530864 | NaN |
Participant 8 | 201.799232 | 144.032661 | 161.606148 | 109.024976 | 55.949952 | 285.806916 | 570.100865 | 1240.792507 | 6.113641 | 1.783168 | ... | 170.406286 | 102.499418 | 47.423050 | 278.981374 | 607.409779 | 1168.190920 | 4.862049 | 2.428739 | 3238.316647 | NaN |
Participant 9 | 125.233402 | 73.839212 | 91.713693 | 96.490664 | 48.927282 | 251.312241 | 622.702282 | 1395.449170 | 4.715975 | 1.170438 | ... | 101.525974 | 96.567532 | 37.484805 | 259.129870 | 623.337662 | 1236.240260 | 3.616364 | 1.707231 | 2276.292208 | NaN |
Participant 10 | 105.308287 | 71.392265 | 83.322652 | 73.269613 | 76.519337 | 296.044199 | 822.773481 | 620.046409 | 5.592928 | 0.900290 | ... | 88.494526 | 69.306569 | 83.577007 | 301.925182 | 876.195255 | 723.797445 | 5.779927 | 0.938234 | 1250.994526 | NaN |
Participant 11 | 130.189336 | 78.165397 | 99.374320 | 91.220892 | 45.370511 | 255.816104 | 663.803047 | 1458.297062 | 4.125027 | 1.467398 | ... | 91.247253 | 94.151099 | 45.262637 | 254.162088 | 642.431319 | 1407.892857 | 4.251786 | 1.304566 | 1739.418956 | NaN |
Participant 12 | 113.641154 | 79.105395 | 93.521957 | 79.949812 | 49.871016 | 275.690088 | 753.262233 | 830.787955 | 3.966123 | 1.421580 | ... | 101.213292 | 81.173107 | 61.067697 | 277.349304 | 742.272025 | 1097.418856 | 4.938949 | 1.237804 | 1650.420402 | NaN |
Participant 13 | 153.799539 | 94.494240 | 120.074885 | 73.370968 | 29.923618 | 284.487327 | 822.304147 | 1506.566820 | 2.184793 | 3.328818 | ... | 118.593614 | 65.931785 | 34.761974 | 299.825835 | 920.703919 | 1748.258345 | 2.276778 | 3.164370 | 4219.204644 | NaN |
Participant 14 | 110.663357 | 65.892751 | 85.097319 | 101.382324 | 55.351936 | 249.021847 | 596.583913 | 1170.305859 | 5.583913 | 0.918202 | ... | 95.851064 | 101.436796 | 63.840926 | 248.873592 | 600.963705 | 1666.147685 | 6.453191 | 0.902253 | 1203.013767 | NaN |
Participant 15 | 146.770254 | 91.220073 | 110.955260 | 83.553809 | 94.748126 | 284.510278 | 725.622733 | 1377.804111 | 7.892624 | 0.851979 | ... | 121.593900 | 78.871589 | 93.343660 | 292.271268 | 770.882825 | 1666.252006 | 7.341413 | 1.003722 | 1338.282504 | NaN |
Participant 16 | 127.301105 | 89.838398 | 101.250000 | 72.772099 | 61.178315 | 288.038674 | 828.674033 | 801.943370 | 4.440884 | 1.374302 | ... | 103.336283 | 71.086726 | 54.882655 | 283.743363 | 849.628319 | 714.893805 | 3.890442 | 1.603179 | 2137.545133 | NaN |
Participant 17 | 134.065134 | 84.581098 | 102.873563 | 78.996169 | 62.525032 | 302.247765 | 765.593870 | 1037.118774 | 4.920690 | 1.268326 | ... | 104.625413 | 78.064356 | 62.809571 | 308.778878 | 776.245875 | 1111.288779 | 4.877723 | 1.303726 | 1738.303630 | NaN |
Participant 18 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 82.908333 | 69.183333 | 69.906667 | 283.000000 | 978.833333 | 2980.658333 | 4.875000 | 1.508275 | 2011.033333 | NaN |
Participant 19 | 130.547507 | 81.548448 | 101.300094 | 107.088429 | 48.333114 | 246.053622 | 564.256820 | 1665.835372 | 5.163594 | 1.187425 | ... | 103.176471 | 107.639706 | 46.662745 | 247.555147 | 562.261029 | 1504.511029 | 5.006250 | 1.255143 | 1673.507353 | NaN |
Participant 20 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | 90.977867 | 62.623742 | 58.312274 | 338.571429 | 965.462777 | 732.987928 | 3.639638 | 1.508684 | 2011.575453 | NaN |
20 rows × 36 columns
The head of that dataset looks much better. There's an issue with Participants 18 and 20 (their baseline columns are all NaN), but everyone else looks good. Hopefully this data is enough for you to work with.
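(A quick sketch to confirm which participants are affected:)

# Sketch: show the participants whose baseline columns are all NaN in the final frame.
baseline_cols = new_result_df.filter(like='baseline').columns
new_result_df[new_result_df[baseline_cols].isna().all(axis=1)].index.tolist()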
I’m going to save it to an excel file now.
new_result_df.to_excel(output/'Ailbhe data time one and two (naming fixed).xlsx')
OK, so you should now have 2 datasets: one with loads of bad data in it, to show you how important it is to get the protocol right, and one where the data has been pre-cleaned a little. Hopefully this is enough for you to do the analysis with Stephen on Monday.
All the best,
Kev