diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 0000000..5d02ec4 --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,62 @@ +{ + "files": [ + "README.md" + ], + "imageSize": 100, + "commit": false, + "commitType": "docs", + "commitConvention": "angular", + "contributors": [ + { + "login": "RayStick", + "name": "Rachael Stickland", + "avatar_url": "https://avatars.githubusercontent.com/u/50215726?v=4", + "profile": "http://linkedin.com/in/rstickland-phd", + "contributions": [ + "maintenance", + "projectManagement", + "code", + "doc", + "ideas" + ] + }, + { + "login": "Rainiefantasy", + "name": "Mahwish Mohammad", + "avatar_url": "https://avatars.githubusercontent.com/u/43926907?v=4", + "profile": "https://github.com/Rainiefantasy", + "contributions": [ + "maintenance", + "projectManagement", + "code", + "doc", + "ideas" + ] + }, + { + "login": "BatoolMM", + "name": "Batool Almarzouq", + "avatar_url": "https://avatars.githubusercontent.com/u/53487593?v=4", + "profile": "https://batool-almarzouq.netlify.app/", + "contributions": [ + "review" + ] + }, + { + "login": "amallon", + "name": "Ann-Marie Mallon", + "avatar_url": "https://avatars.githubusercontent.com/u/35258603?v=4", + "profile": "https://github.com/amallon", + "contributions": [ + "projectManagement", + "ideas" + ] + } + ], + "contributorsPerLine": 7, + "skipCi": true, + "repoType": "github", + "repoHost": "https://github.com", + "projectName": "cprd-data-wrangle", + "projectOwner": "aim-rsf" +} diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..c17bf8c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,36 @@ +--- +name: Bug report +about: Create a report to help us improve +title: '' +labels: 'bug' +assignees: '' + +--- + +**Description of the bug:** + +- +- + +**Steps to reproduce the behaviour:** + + + + + +- +- + +**Expected behaviour:** + +- +- + +**My software set-up:** + + +- +- + 
+**Additional context:** + diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md new file mode 100644 index 0000000..cc3190a --- /dev/null +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -0,0 +1,21 @@ +--- +name: Feature request +about: Suggest a specific idea for this project +title: '' +labels: '' +assignees: '' + +--- + + + +**I would like to see this change implemented:** + + +- +- + +**Additional context or further questions:** + +- +- diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md new file mode 100644 index 0000000..4b2452c --- /dev/null +++ b/.github/ISSUE_TEMPLATE/question.md @@ -0,0 +1,13 @@ +--- +name: Question or Discussion +about: Not a bug report or direct feature request +title: '' +labels: 'question' +assignees: '' + +--- + +**Question/topic for discussion:** + +- +- diff --git a/.github/config.yml b/.github/config.yml new file mode 100644 index 0000000..34ee215 --- /dev/null +++ b/.github/config.yml @@ -0,0 +1,22 @@ +# Configuration for welcome - https://github.com/behaviorbot/welcome + +# Configuration for new-issue-welcome - https://github.com/behaviorbot/new-issue-welcome + +# Comment to be posted to on first time issues +newIssueWelcomeComment: > + + 🎉 Thank you for opening your first issue in this repo! Please check our [contribution guidelines](CONTRIBUTING.md) and be guided by the issue template. + +# Configuration for new-pr-welcome - https://github.com/behaviorbot/new-pr-welcome + +# Comment to be posted to on PRs from first time contributors in your repository +newPRWelcomeComment: > + + 🎉 Thanks for opening your first pull request in this repo! Please check our [contribution guidelines](CONTRIBUTING.md) and be guided by the PR template. 
+ +# Configuration for first-pr-merge - https://github.com/behaviorbot/first-pr-merge + +# Comment to be posted to on pull requests merged by a first time user +firstPRMergeComment: > + + 🎉 Congrats on merging your first pull request in this repo! We appreciate your contribution! diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 0000000..d2d1fb8 --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,11 @@ +Documentation: + - changed-files: + - any-glob-to-any-file: ['README.md','CONTRIBUTING.md','cprd-code-browser.md'] + +Internal: + - changed-files: + - any-glob-to-any-file: ['.github/*','.all-contributorsrc','.gitignore'] + +# Testing: +# - changed-files: +# - any-glob-to-any-file: [] diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000..2ee2b1b --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,19 @@ + + + +Closes # + + + +## Proposed Changes + + - + - + - + +## Checklist before review: + +- [ ] Please comment on my PR while it's a draft and give me feedback on the development! +- [ ] I added everything I wanted to add to this PR, please review! +- [ ] The title of this PR is clear and self-explanatory. +- [ ] I added any appropriate labels to this PR. 
diff --git a/.github/workflows/auto-author-assign.yml b/.github/workflows/auto-author-assign.yml new file mode 100644 index 0000000..8b102ae --- /dev/null +++ b/.github/workflows/auto-author-assign.yml @@ -0,0 +1,17 @@ +name: Auto Author Assign + +on: + issues: + types: [ opened, reopened ] + pull_request_target: + types: [ opened, reopened ] + +permissions: + pull-requests: write + issues: write + +jobs: + assign-author: + runs-on: ubuntu-latest + steps: + - uses: toshimaru/auto-author-assign@v2.1.0 diff --git a/.github/workflows/auto-label.yml b/.github/workflows/auto-label.yml new file mode 100644 index 0000000..a4c3f22 --- /dev/null +++ b/.github/workflows/auto-label.yml @@ -0,0 +1,15 @@ +name: auto-label +concurrency: + group: ${{ github.workflow }}-${{ github.event.number }}-${{ github.event.ref }} + cancel-in-progress: true +on: # yamllint disable-line rule:truthy + pull_request_target + +jobs: + pr: + permissions: + contents: read + pull-requests: write + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ + diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..97918b3 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,29 @@ +# Contributing to cprd-data-wrangle + +We warmly welcome contributions to the cprd-data-wrangle project, however small or large. + +This document provides guidelines for contributing to this repository. + +## How to Contribute + +### Reporting Issues + +- **Bug Reports**: If you find a bug, please open an issue with a clear description of the problem and steps to reproduce it. +- **Feature Requests**: Suggestions for new features or improvements are always welcome. Please open an issue to discuss your ideas. + +### Making Changes + +1. **Fork the Repository**: Start by forking the repository to your GitHub account. +2. 
**Create a Feature Branch**: Create a new branch for your feature or fix. +3. **Make Your Changes**: Implement your changes, adhering to coding standards and best practices for Markdown and Python. +4. **Test Your Changes**: Ensure your changes do not break any existing functionality. +5. **Document Your Changes**: Include comments alongside your code, and update any relevant documentation files, as needed. +6. **Submit a Pull Request**: Open a pull request from your feature branch to the main branch of the original repository. + +## Questions or Need Help? + +If you have questions or need help, feel free to open an issue for discussion or reach out to the maintainers directly: + +Rachael Stickland (rstickland@turing.ac.uk) and Mahwish Mohammad (mmohammad@turing.ac.uk). + +Thank you for contributing to cprd-data-wrangle! diff --git a/README.md b/README.md index d94662f..edc659c 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,86 @@ -# cprd-data-wrangle -This repository is for anyone new to working with datasets released by the Clinical Practice Research Datalink (CPRD). Researchers tasked with understanding the database tables, then querying and filtering to create a research cohort, may find our pre-processing pipeline and interactive notebooks a helpful guide to getting started. + +[![All Contributors](https://img.shields.io/badge/all_contributors-4-orange.svg?style=flat-square)](#contributors-) + +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) -📢 This repository will be populated soon! 📢 In the meantime, check out this other repository for an introduction to synthetic data, in the context of health-care and biomedical research: https://github.com/aim-rsf/Synthetic-Data + +# 👋 Welcome + +## 👥 Who is this repository for? + +This repository is for anyone new to working with datasets released by the [Clinical Practice Research Datalink (CPRD)](https://cprd.com). 
Researchers tasked with understanding the database tables, then querying and filtering to create a research cohort, may find our pre-processing pipeline and interactive notebooks a helpful guide to getting started. + +**Please note:** + +- **You need your own copy of CPRD's synthetic/real data to run the code. This repository does not contain any data files.** + +- **CPRD are moving towards a TRE model of data access, instead of a researcher downloading data onto their own computer. Read more [here](https://www.cprd.com/cprd-safe-our-trusted-research-environment).** + +- **This is a work in progress repository. If you would like to suggest or contribute a change, please read our [contributor guide](CONTRIBUTING.md).** + +# 🥅 Project Goals + +We aim to streamline the process for researchers using CPRD datasets, with the creation of clear documentation, efficient data management strategies and analytical pipelines. We will start with development of workflows utilising CPRD's medium fidelity synthetic datasets because they resemble +> "the real world CPRD data with respect to the data types, data values, data formats, data structure and table relationships" [ref](https://cprd.com/synthetic-data). + +**New to Synthetic Data?** Read an introduction [here](https://github.com/aim-rsf/Synthetic-Data). + +We will create and share documentation & code, in openly available languages. We will start by loading the data into a relational database and summarising some of its main features. + +By working with our research collaborators, we aim to test workflows written with synthetic datasets on the real datasets to ensure transferability and utility. An anticipated mismatch will be the size of the data files and possibly the variability in file format. Please reach out to us if you want to test our code on your real CPRD data, or have any feedback on improving transferability and utility. 
+ +CPRD's most recently released data specifications can be found [here for the real datasets](https://cprd.com/primary-care-data-public-health-research) and [here for the synthetic datasets](https://cprd.com/synthetic-data). + +# 💻 Current content + +We include information on [CPRD's Code Browser tool](cprd-code-browser.md) and how to request access to it. + +The [code-for-aurum](code-for-aurum) folder uses `Python` and `PostgreSQL` to create a pre-processing workflow for CPRD Aurum data which includes a conversion of data file format for compatibility, and then reading the data into tables in a relational database. Workbooks have been created to familiarise a user with the CPRD Aurum tables, including how they link together and how to build a sample cohort: + +https://github.com/user-attachments/assets/9a636d4c-8170-4145-b6fc-60ac0f4c16d1 + +# 🤝 Contributions and Acknowledgments + +We acknowledge and thank these groups for making this project possible: + +- [National Institute for Health and Care Research (NIHR)](https://www.nihr.ac.uk/) for funding the AIM-RSF programme of work [NIHR202647] - see below. +- The [AI for Multiple Long Term Conditions Research Support Facility (AIM-RSF)](https://github.com/aim-rsf) programme for facilitating the delivery of this project. + - This repository was created and is maintained by the AIM-RSF, led by [Data Wranglers](https://book.the-turing-way.org/collaboration/research-infrastructure-roles/data-wrangler.html) Rachael Stickland & Mahwish Mohammad. +- [Clinical Practice Research Datalink (CPRD)](https://cprd.com) for access to synthetic versions of their datasets [synthetic data request no: SD000021]. +- [The Alan Turing Institute](https://www.turing.ac.uk/). This project was supported in part through computational resources provided by The Alan Turing Institute under EPSRC grant EP/N510129/1. 
+ +The views expressed within any file in this repository are those of the author(s) within the AIM-RSF programme, and not necessarily those of the: NIHR, Department of Health and Social Care, Medicines and Healthcare products Regulatory Agency (MHRA) or CPRD. + +## Thanks to specific contributors + +This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification, using the [emoji key](https://allcontributors.org/docs/en/emoji-key): + + + + + + + + + + + + + +
Rachael Stickland
Rachael Stickland

📆 🚧 💻 📖 🤔
Mahwish Mohammad
Mahwish Mohammad

🚧 💻 📖 🤔 👀
Batool Almarzouq
Batool Almarzouq

👀 🤔
Ann-Marie Mallon
Ann-Marie Mallon

📆 🤔
Kirstie Whitaker
Kirstie Whitaker

🤔
+ + + + + + +**Would you like to contribute?** Please read our [contributor guide](CONTRIBUTING.md). + +## ♻️ Licences + +This project is licensed under the MIT License. See the [LICENSE](LICENSE) file for more details. + +--- + +You got to the end of the README? You get our :seal: of approval! diff --git a/code-for-aurum/README.md b/code-for-aurum/README.md new file mode 100644 index 0000000..20b8bdc --- /dev/null +++ b/code-for-aurum/README.md @@ -0,0 +1,58 @@ +# CPRD Aurum Synthetic Data Workflow + +We are assuming some familiarity with coding in order to follow this workflow. Step 1 of the workflow uses Python and PostgreSQL. To understand more about our environmental set-up and software configurations, see the [installation-setup.md](installation-setup.md) file. If you already have your CPRD Aurum data loaded into a database, you might want to skip to Step 2. + + +## [Step 1](Step1-PreProc): From text files to SQL tables + +Step 1 starts with the raw text files provided by CPRD, and formats them into relational tables in a SQL database. + +### Step 1A: Meta-data, from PDF to csv + +In order to create the tables in a SQL database, we require a machine readable metadata file which provides the *name of each data table* and the *field names* and *data types* within each table. This python code below creates a csv file for each table: + +`` python Step1A-Generate-metadata-csvs.py`` + +> ❗ Step1A is a manual process and is not best practice. For more details on our ideal workflow, read here [^1]. As metadata sources become more available, complete and accurate, some of these pre-processing steps will no longer be relevant and/or can be automated. This was the best solution we found at the time of analysis. + +### Step 1B: Data, from text to csv + +The files are provided by CPRD as flat files in text format. In this section, a Python script is used to convert the format from text to csv format. 
This gives more flexibility when using PostgreSQL's COPY command later in the workflow, to import the csv file to a relational table. More specifically, the script alters the field delimiter, wrapped quotes and datetime field formatting. + +To run this file conversion: + +``python Step1B-Generate-data-csvs.py`` + +### Step 1C: From csv to SQL table + +This section assumes that steps 1A and 1B have been completed, and a database has been created in PostgreSQL (in which you have permissions to write). A Python script is used to create a .sql file, based on the data and metadata csv files: + +``python Step1C-Generate-SQL-queries.py`` + +This will output a .sql file called `Step1C-create-tables.sql` + +Running this sql file, when connected to your specified database, will create the relational tables. + +## [Step 2](Step2-Notebooks): Workbooks (Notebook tutorials) + +### Step2A: Introduction to CPRD Aurum Sample (Synthetic) Dataset + +The aim of this notebook is to provide familiarity with the tables that make up the CPRD Aurum Sample (Synthetic) Dataset. + +### Step2B: Summary statistics for CPRD Aurum Sample (Synthetic) Dataset + +The summary statistics created in this notebook follow the structure of those within the ['Release Notes: CPRD Aurum Sample Dataset October 2021'](https://www.cprd.com/sites/default/files/2022-02/CPRD%20Aurum%20Sample%20Dataset%20Release%20Notes.pdf) PDF. This notebook aims to replicate the numbers that CPRD provides using SQL commands, as an introduction to interacting with this dataset and the tables with SQL. + +### Step2C: CPRD Cohort Criteria Examples + +This notebook was created to replicate the example criteria given in CPRD Aurum FAQs v2.4 (see their [website](https://www.cprd.com/primary-care-data-public-health-research)). +This notebook uses these examples to increase understanding of the tables and explain how to write queries for example criteria. 
These types of queries would allow a research team to filter the CPRD data, to create a sample cohort that matches their research questions e.g. select patients within a certain age range and on a specific medication. + +### How to interact with the notebooks + +These Jupyter notebooks are intended to be interactive, because they contain markdown cells with explanatory text alongside cells with processing code, each of which is rendered differently. They were written using Visual Studio Code using the Python and Jupyter extensions. To run a notebook, you need to be connected to a Python kernel. To be able to run PostgreSQL commands within a notebook you need to follow the steps explained in [installation-setup.md](installation-setup.md) under 'PostgreSQL Integration with Jupyter Notebook'. + +Alternatively, you can simply read the notebook as a guide and copy and paste the SQL commands into whatever interface you are using to interact with your database! + + +[^1]: This is a bit long-winded and hard-coded in places due to limitations of file inputs. Our ideal workflow looks something like [this](https://github.com/aim-rsf/cprd-data-wrangle/blob/main/code-for-aurum/workflow_idea.png). The Python code for Step 1A was written because the CPRD Aurum Data Specification is only provided in PDF format. It was created by copying and pasting manually from the PDF data spec (Version 2.9, Date: 27 April 2023). There is a chance of errors! This is temporary code that acts as a proof of principle - showing what the workflow can be with a machine readable metadata file. The look-up tables did not have table descriptions in the PDF so we made some sensible choices, choosing 'TEXT' or 'INTEGER' for data types. The ideal scenario would be to have regularly updated machine readable data specs (metadata). The format of the metadata must match the format of the data files. 
The *[Metadata Catalogue](https://modelcatalogue.cs.ox.ac.uk/hdruk_live/#/catalogue/dataModel/all)* allows you to easily download CPRD metadata in a machine-readable format (XML, JSON, XLSX). However, the metadata provides 'Column name' only, but the columns in CPRD data files are actually named after 'Field name'. This (and some other differences noted between these metadata files and the data files) meant we could not use the metadata catalogue files in an automated workflow. diff --git a/code-for-aurum/Step1-PreProc/Step1A-Generate-metadata-csvs.py b/code-for-aurum/Step1-PreProc/Step1A-Generate-metadata-csvs.py new file mode 100644 index 0000000..2651d91 --- /dev/null +++ b/code-for-aurum/Step1-PreProc/Step1A-Generate-metadata-csvs.py @@ -0,0 +1,344 @@ +# CPRD Aurum Data Specification, Version 2.9, Date: 27 April 2023 +# The metadata was copied manually from CPRD's PDF data specifications and pasted below +# This code takes this copied metadata and creates a csv file for each table +# All csv files are saved within 'metadata_csv' directory and used for Step1C + +import csv + +dataspec_version = 'v2p9' + +## Create quick function to avoid copy & pasting for every file +def write_medata_csv(file_name, row_list): + csv_name = 'metadata_csv/' + file_name + '-' + dataspec_version + '.csv' + with open(csv_name, 'w', newline='') as file: + writer = csv.writer(file) + writer.writerows(row_list) + +## 1. Patient +file_name = 'Patient' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Patient identifier","patid","TEXT","6-19 numeric characters","None","Encrypted unique identifier given to a patient in CPRD Aurum. The patient identifier is unique to CPRD Aurum and may represent a different patient in the CPRD GOLD database. This is the primary key for this table. 
The last 5 characters will be same as the CPRD practice identifier"], + ["CPRD practice identifier","pracid","INTEGER","5","Link Practice table","Encrypted unique identifier given to a practice in CPRD Aurum"], + ["Usual GP","usualgpstaffid","TEXT","Up to 10 numeric characters","Link Staff table","The GP that the patient is nominally registered with. To be used with the Staff table for reference"], + ["Gender","gender","INTEGER","3","Lookup: Gender.txt","Patient's gender"], + ["Year of birth","yob","INTEGER","4","None","Patient's year of birth. This is actual year of birth e.g. 1984."], + ["Month of birth","mob","INTEGER","2","None","Patient's month of birth (for those aged under 16)."], + ["Date of death","emis_ddate","DATE","DD/MM/YYYY","None","Date of death as recorded in the EMIS® software. Researchers are advised to treat the emis_ddate with caution and consider using the cprd_ddate variable below."], + ["Registration start date","regstartdate","DATE","DD/MM/YYYY","None","The date that the patient registered with the CPRD contributing practice. Most recent date the patient is recorded as having registered at the practice. If a patient deregistered for a period of time and returned, the return date would be recorded."], + ["Patient type","patienttypeid","INTEGER","5","Lookup: PatientType.txt","The category that the patient has been assigned to e.g. private, regular, temporary."], + ["Registration end date","regenddate","DATE","DD/MM/YYYY","None","Date the patient's registration at the practice ended. This may represent a transfer-out date or death date."], + ["Acceptable flag","acceptable","INTEGER","1","None","Flag to indicate whether the patient has met certain quality standards: 1 =acceptable, 0 = unacceptable"], + ["CPRD death date","cprd_ddate","DATE","DD/MM/YYYY","None","Estimated date of death of patient - derived using a CPRD algorithm"] + ] +write_medata_csv(file_name,row_list) + +## 2. 
Practice +file_name = 'Practice' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Practice identifier","pracid","INTEGER","5","None","Encrypted unique identifier given to a practice in CPRD Aurum. This is the primary key for this table."], + ["Last Collection Date","lcd","DATE","DD/MM/YYYY","None","Date of the most recent CPRD data collection for the practice."], + ["Up-to-standard date","uts","DATE","DD/MM/YYYY","None","The date at which the practice data is deemed to be of research quality, based on CPRD algorithm. [Not currently populated]"], + ["Region","region","INTEGER","5","Lookup: Region.txt","Value to indicate where in the UK the practice is based. The region denotes the ONS Region for English practices."] + ] +write_medata_csv(file_name,row_list) + +## 3. Staff +file_name = 'Staff' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Staff identifier","staffid","TEXT","Up to 10 numeric characters","None","Encrypted unique identifier given to the practice staff member in CPRD Aurum. This is the primary key for this table."], + ["Practice identifier","pracid","INTEGER","5","Link Practice table","Encrypted unique identifier given to a practice in CPRD Aurum"], + ["Job category","jobcatid","INTEGER","5","Lookup JobCat.txt","Job category of the staff member who created the event"] +] +write_medata_csv(file_name,row_list) + +## 4. Consultation +file_name = 'Consultation' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Patient identifier","patid","TEXT","6-19 numeric characters","Link Patient table","Encrypted unique identifier given to a patient in CPRD Aurum. The patient identifier is unique to CPRD Aurum and may represent a different patient in the CPRD GOLD database."], + ["Consultation identifier","consid","TEXT","Up to 19 numeric characters","None","Unique identifier given to the consultation. 
This is the primary key for this table."], + ["Practice identifier","pracid","INTEGER","5","Link Practice table","Encrypted unique identifier given to a practice in CPRD Aurum"], + ["Event date","consdate","DATE","DD/MM/YYYY","None","Date associated with the event"], + ["Entered date","enterdate","DATE","DD/MM/YYYY","None","Date the event was entered into the practice system"], + ["Staff identifier","staffid","TEXT","Up to 10 numeric characters","Link Staff table","Encrypted unique identifier given to the practice staff member who took the consultation in CPRD Aurum"], + ["EMIS® consultation source identifier","conssourceid","TEXT","Up to 10 numeric characters","Lookup: ConsSource.txt","Identifier that allows retrieval of anonymised information on the source or location of the consultation as recorded in the EMIS® software. Only the most frequently occurring strings have been anonymised and are included in the lookup. All others have been withheld by CPRD, pending anonymisation where feasible."], + ["CPRD consultation source identifier","cprdconstype","INTEGER","3","Lookup: cprdconstype.txt [not included in initial release]","Type of consultation: this will be generated by CPRD based on information recorded in the consmedcodeid and conssourceid variables. [Not currently populated]"], + ["Consultation source code identifier","consmedcodeid","TEXT","6-18 numeric characters","Medical dictionary. Maps to medcodeid","Source of the consultation from EMIS® software. This is a medical code that can be used with the medical dictionary. It may contain information similar to the consultation source identifiers but is available for use now. Some of the codes may not be interpretable e.g. Awaiting clinical code migration to EMIS Web®."] + ] +write_medata_csv(file_name,row_list) + +## 5. 
Observation +file_name = 'Observation' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Patient identifier","patid","TEXT","6-19 numeric characters","Link Patient table","Encrypted unique identifier given to a patient in CPRD Aurum. The patient identifier is unique to CPRD Aurum and may represent a different patient in the CPRD GOLD database."], + ["Consultation identifier","consid","TEXT","Up to 19 numeric characters","Link Consultation table","Linked consultation identifier. In EMIS Web® it is not necessary to enter observations within a consultation, so this identifier may be missing."], + ["Practice identifier","pracid","INTEGER","5","Link Practice table","Encrypted unique identifier given to a practice in CPRD Aurum"], + ["Observation identifier","obsid","TEXT","Up to 19 numeric characters","None","Unique identifier given to the observation. This is the primary key for this table."], + ["Event date","obsdate","DATE","DD/MM/YYYY","None","Date associated with the event"], + ["Entered date","enterdate","DATE","DD/MM/YYYY","None","Date the event was entered into EMIS Web®"], + ["Staff identifier","staffid","TEXT","Up to 10 numeric characters","Link Staff table","Encrypted unique identifier given to the practice staff member who took the consultation in CPRD Aurum"], + ["Parent observation identifier","parentobsid","TEXT","Up to 19 numeric characters","Link Observation table","Observation identifier (obsid) that is the parent to the observation. 
This enables grouping of multiple observations, such as systolic and diastolic blood pressure, or blood test results."], + ["Medical code","medcodeid","TEXT","6-18 numeric characters","Lookup: Medical dictionary","CPRD unique code for the medical term selected by the GP"], + ["Value","value","NUMERIC","19.3","None","Measurement or test value"], + ["Numeric unit identifier","numunitid","INTEGER","10","Lookup: NumUnit.txt","Unit of measurement"], + ["Observation type identifier","obstypeid","INTEGER","5","Lookup: ObsType.txt","Type of observation (allergy, family history, observation)"], + ["Numeric range low","numrangelow","NUMERIC","19.3","None","Value representing the low boundary of the normal range for this measurement"], + ["Numeric range high","numrangehigh","NUMERIC","19.3","None","Value representing the high boundary of the normal range for this measurement"], + ["Problem observation identifier","probobsid","TEXT","Up to 19 numeric characters","Link Observation table","Observation identifier (obsid) of any problem that an observation is associated with. An example of this might be an overarching condition that the current observation is associated with such as 'wheezing' with the problem observation identifier that links to an observation of 'asthma', that in turn contains information in the problem table."] + ] +write_medata_csv(file_name,row_list) + +## 5a. Referral +file_name = 'Referral' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Patient identifier","patid","TEXT","6-19 numeric characters","Link Patient table","Encrypted unique identifier given to a patient in CPRD Aurum. The patient identifier is unique to CPRD Aurum and may represent a different patient in the CPRD GOLD database."], + ["Observation identifier","obsid","TEXT","Up to 19 numeric characters","Link Observation table","Unique identifier given to the observation. 
This is the primary key for this table."], + ["Practice identifier","pracid","INTEGER","5","Link Practice table","Encrypted unique identifier given to a practice in CPRD Aurum"], + ["Referral source organisation identifier","refsourceorgid","INTEGER","10","Lookups: Organisation.txt [not included in initial release] and OrgType.txt","Source organisation of the referral. Organisations are identified by an ID number and each organisation has a type (e.g. hospital inpatient department, community clinic). Both the organisation table and the OrgType lookup are required. The lookups are undergoing anonymisation work. [Not currently populated]"], + ["Referral target organisation identifier","reftargetorgid","INTEGER","10","Lookups: Organisation.txt [not included in initial release] and OrgType.txt","Source organisation of the referral. Organisations are identified by an ID number and each organisation has a type (e.g. hospital inpatient department, community clinic). Both the organisation table and the OrgType lookup are required. The lookups are undergoing anonymisation work. [Not currently populated]"], + ["Referral urgency identifier","refurgencyid","INTEGER","1","Lookup: RefUrgency.txt","Urgency of the referral e.g. routine, urgent, dated"], + ["Referral service type identifier","refservicetypeid","INTEGER","2","Lookup: RefServiceType.txt","Type of service the referral relates to e.g. assessment, management, investigation"], + ["Referral mode identifier","refmodeid","INTEGER","1","Lookup: RefMode.txt","Mode by which the referral was made e.g. telephone, written"] + ] +write_medata_csv(file_name,row_list) + +## 5b. Problem +file_name = 'Problem' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Patient identifier","patid","TEXT","6-19 numeric characters","Link Patient table","Encrypted unique identifier given to a patient in CPRD Aurum. 
The patient identifier is unique to CPRD Aurum and may represent a different patient in the CPRD GOLD database."], + ["Observation identifier","obsid","TEXT","Up to 19 numeric characters","Link Observation table","Unique identifier given to the observation. This is the primary key for this table."], + ["Practice identifier","pracid","INTEGER","5","Link Practice table","Encrypted unique identifier given to a practice in CPRD Aurum"], + ["Parent problem observation identifier","parentprobobsid","TEXT","Up to 19 numeric characters","Link Observation table","Observation identifier for the parent observation of the problem. This can be used to group problems via the Observation table."], + ["Problem end date","probenddate","DATE","DD/MM/YYYY","None","Date the problem ended"], + ["Expected duration","expduration","INTEGER","5","None","Expected duration of the problem in days"], + ["Last review date","lastrevdate","DATE","DD/MM/YYYY","None","Date the problem was last reviewed"], + ["Last review staff identifier","lastrevstaffid","TEXT","Up to 10 numeric characters","Link Staff table","Staff member who last reviewed the problem"], + ["Parent problem relationship identifier","parentprobrelid","INTEGER","5","Lookup ParentProbRel.txt","Description of the relationship of the problem to another problem e.g. the problem may have evolved or been merged with another problem as the problem, or the GP’s understanding of the problem, changes"], + ["Problem status identifier","probstatusid","INTEGER","5","Lookup: ProbStatus.txt","Status of the problem (active, past)"], + ["Significance","signid","INTEGER","5","Lookup: Sign.txt","Significance of the problem (minor, significant)"], + ] +write_medata_csv(file_name,row_list) + +## 6. 
Drug Issue +file_name = 'DrugIssue' +row_list = [ + ["Column name","Field name","Type","Format","Mapping","Description"], + ["Patient identifier","patid","TEXT","6-19 numeric characters","Link Patient table","Encrypted unique identifier given to a patient in CPRD Aurum. The patient identifier is unique to CPRD Aurum and may represent a different patient in the CPRD GOLD database."], + ["Issue record identifier","issueid","TEXT","Up to 19 numeric characters","None","Unique identifier given to the issue record. This is the primary key for this table."], + ["Practice identifier","pracid","INTEGER","5","Link Practice table","Encrypted unique identifier given to a practice in CPRD Aurum"], + ["Problem observation identifier","probobsid","TEXT","Up to 19 numeric characters","Link Observation and Problem tables","Unique identifier for the observation that links to a problem under which this prescription was issued. This refers to an 'obsid' in the Observation table which, in turn, can be linked to a record in the Problem table using 'obsid'."], + ["Drug record identifier","drugrecid","TEXT","Up to 19 numeric characters","None","Unique identifier to a drug template record, which is not currently for release. 
At present this may be used to group repeat prescriptions from the same template."], + ["Event date","issuedate","DATE","DD/MM/YYYY","None","Date associated with the event"], + ["Entered date","enterdate","DATE","DD/MM/YYYY","None","Date the event was entered into EMIS Web®"], + ["Staff identifier","staffid","TEXT","Up to 10 numeric characters","Link Staff table","Encrypted unique identifier given to the practice staff member issued the treatment in CPRD Aurum"], + ["Drug code identifier","prodcodeid","TEXT","6-18 numeric characters","Lookup: Product dictionary","Unique CPRD code for the treatment selected by the GP"], + ["Dosage identifier","dosageid","TEXT","64 characters","Lookup: common_ dosages.txt","Identifier that allows dosage information on the event to be retrieved. Not included in first release"], + ["Quantity","quantity","DECIMAL","9.3 (The number before the decimal point gives the precision i.e. the total number of digits. The number after the decimal point denotes the scale number of decimal places)"," ","Total quantity entered by the GP for the prescribed treatment"], + ["Quantity unit identifier","quantunitid","INTEGER","2","Lookup: QuantUnit.txt","Unit of the treatment (capsule, tablet)"], + ["Course duration in days","duration","INTEGER","10","None","Duration of the treatment in days"], + ["Estimated NHS cost","estnhscost","DECIMAL","10.4 (The number before the decimal point gives the precision i.e. the total number of digits. The number after the decimal point denotes the scale number of decimal places)","None","Estimated cost of the treatment to the NHS"] + ] +write_medata_csv(file_name,row_list) + +## I. Medical dictionary +file_name = 'MedicalDictionary' +row_list = [ + ["Column name","Type","Format","Mapping","Description"], + ["medcodeid","TEXT","6-18 numeric characters","None","CPRD code to describe the observation. Links to the observation table. 
This is the primary key for this table."], + ["term","TEXT","255 characters","None","Description of the observation associated with the codeid"], + ["originalreadcode","TEXT","100 characters","None","The original Read code text as provided in the EMIS® dictionary (contains codes which are not valid Read codes)"], + ["cleansedreadcode","TEXT","7 characters","None","CPRD-cleaned and validated version of the originalreadcode"], + ["snomedctconceptid","TEXT","Up to 19 numeric characters","None","The SNOMED CT concept identifier associated with the observation"], + ["snomedctdescriptionid","TEXT","Up to 19 numeric characters","None","The SNOMED CT description identifier associated with the observation"], + ["release","TEXT","100 characters","None","Reference data version. [Not currently populated]"], + ["emiscodecategoryid","INTEGER","2","Lookup: EMISCodeCat.txt","Category identifier in EMIS® that describes the observation"] + ] +write_medata_csv(file_name,row_list) + +# ## II. Product dictionary +file_name = 'ProductDictionary' +row_list = [ + ["Column name","Type","Format","Mapping","Description"], + ["prodcodeid","TEXT","6-18 numeric characters","None","CPRD code to describe the treatment. Links to the Drug Issue table. This is the primary key for this table."], + ["dmdid","TEXT","Up to 19 numeric characters","None","The DM+D code associated with the treatment"], + ["termfromemis","TEXT","255 characters","None","Description of the treatment provided by EMIS® associated with the prodcodeid"], + ["productname","TEXT","Up to 999 characters","None","Name of the product"], + ["formulation","TEXT","Up to 999 characters","None","Formulation of the product"], + ["routeofadministration","TEXT","Up to 999 characters","None","Route of administration for the product"], + ["drugsubstancename","TEXT","Up to 999 characters","None","Active ingredient(s) included in the product. 
For combination therapies, each component is listed, separated by /"], + ["substancestrength","TEXT","Up to 999 characters","None","Strength of each active ingredient listed in the drugsubstancename column, including units. Separated by / if more than 1"], + ["bnfchapter","TEXT","Up to 999 characters","None","BNF chapter to which the product belongs"], + ["release","TEXT","100 characters","None","Reference data version. [Not currently populated]"] + ] +write_medata_csv(file_name,row_list) + +## LOOK UP TABLES + +### Gender +file_name = 'Gender' +row_list = [ + ["Field name","Type"], + ["genderid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### PatientType +file_name = 'PatientType' +row_list = [ + ["Field name","Type"], + ["patienttypeid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### Region +file_name = 'Region' +row_list = [ + ["Field name","Type"], + ["regionid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### JobCat +file_name = 'JobCat' +row_list = [ + ["Field name","Type"], + ["jobcatid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### ConsSource +file_name = 'ConsSource' +row_list = [ + ["Field name","Type"], + ["conssourceid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### ObsType +file_name = 'ObsType' +row_list = [ + ["Field name","Type"], + ["obstypeid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### NumUnit +file_name = 'NumUnit' +row_list = [ + ["Field name","Type"], + ["numunitid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### RefServiceType +file_name = 'RefServiceType' +row_list = [ + ["Field name","Type"], + ["refservicetypeid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### RefUrgency +file_name = 'RefUrgency' +row_list = [ + ["Field name","Type"], 
+ ["refurgencyid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### OrgType +file_name = 'OrgType' +row_list = [ + ["Field name","Type"], + ["orgtypeid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### RefMode +file_name = 'RefMode' +row_list = [ + ["Field name","Type"], + ["refmodeid","INTEGER"], + ["Description","TEXT" ] + ] +write_medata_csv(file_name,row_list) + +### ParentProbRel +file_name = 'ParentProbRel' +row_list = [ + ["Field name","Type"], + ["parentprobrelid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### ProbStatus +file_name = 'ProbStatus' +row_list = [ + ["Field name","Type"], + ["probstatusid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### Sign +file_name = 'Sign' +row_list = [ + ["Field name","Type"], + ["signid","INTEGER"], + ["Description","TEXT"], + ] +write_medata_csv(file_name,row_list) + +### Common_dosages +# Reading all in as TEXT, but return to later: FLOAT, INT or DECIMAL may be more appropriate +file_name = 'Common_dosages' +row_list = [ + ["Field name","Type"], + ["dosageid","TEXT"], + ["dosage_text","TEXT"], + ["daily_dose","TEXT"], + ["does_number","TEXT"], + ["dose_unit","TEXT"], + ["dose_frequency","TEXT"], + ["dose_interval","TEXT"], + ["choice_of_dose","TEXT"], + ["dose_max_average","TEXT"], + ["change_dose","TEXT"], + ["dose_duration","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### QuantUnit +file_name = 'QuantUnit' +row_list = [ + ["Field name","Type"], + ["quantunitid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) + +### EMISCodeCat +file_name = 'EMISCodeCat' +row_list = [ + ["Field name","Type"], + ["emiscodecatid","INTEGER"], + ["Description","TEXT"] + ] +write_medata_csv(file_name,row_list) diff --git a/code-for-aurum/Step1-PreProc/Step1B-Generate-data-csvs.py b/code-for-aurum/Step1-PreProc/Step1B-Generate-data-csvs.py new file mode 
100644 index 0000000..66d967b --- /dev/null +++ b/code-for-aurum/Step1-PreProc/Step1B-Generate-data-csvs.py @@ -0,0 +1,63 @@ +# Code snippet to generate the pre-processed csv files +# Run as: python Step1B-Generate-data-csvs.py path-to-text-files +# User gives the directory path which contains (only) the cprd txt files to process +# The csv files are outputted into a new 'data_csv' sub-directory and used for Step1C +# (Example) list_of_filenames = ['Common_Dosages','ConsSource','Consultation','DrugIssue','EMISCodeCat','Gender','JobCat','MedicalDictionary','NumUnit','Observation','ObsType','OrgType','ParentProbRel','Patient','PatientType','Practice','Problem','ProbStatus','ProductDictionary','QuantUnit','Referral','RefMode','RefServiceType','RefUrgency','Region','Sign','Staff'] + +## Libraries +import os +import sys +import datetime +import csv +import numpy as np + +## Inputs and directories +path_from = sys.argv[1] +path_to = path_from + '/data_csv' + +## ! don't change code below + +list_of_filenames_ext = os.listdir(path_from) +list_of_filenames=[x.split('.')[0] for x in list_of_filenames_ext] # this assumes no period (.) in the filename + +# create 'path_to' +if os.path.isdir(path_to): + print('There is already a csv directory. 
Exiting!') + exit() +else: + os.mkdir(path_to) + +# loop over txt files +for name in list_of_filenames: + print('reading:',name) + # read file into variable r + r = csv.reader(open(path_from + '/' + name + '.txt', 'r', encoding='latin1'), delimiter=' ',quotechar='"') + + #list to store data in and appending values + data=[] + for row in r: + #print(row) + data.append(row) + + #separating header of file to extract fields == 'lcd' OR that contain 'date' as substring (can tweak later to avoid hardcoding) + header = data[0] + print('header:',header) + date_fields = [idx for idx, x in enumerate(header) if ('date' in x) or (x == 'lcd')] + + #looping over all rows i and cols j of list + for i, row in enumerate(data): # loop rows + for j, col in enumerate(row): # loop cols in rows + #print('row:',i,'col:',j,col) + + #condition to filter datetime fields and reformat from dd/mm/yyyy to datetime format (YYYY-MM-DDT00:00:00.000) + if (len(col)>6) & (j in date_fields) & (i != 0): + data[i][j] = datetime.datetime.strptime(col, "%d/%m/%Y").strftime("%Y-%m-%d") + #print('new col:',j,data[i][j]) + + # open/create the file to write comma separated values to + with open(path_to + '/' + name + '.csv', 'w') as new_csv_file: + csv_writer = csv.writer(new_csv_file, delimiter=',') + for rows in data: + csv_writer.writerow(rows) + + print('Exported file',name,'.csv to location:', path_to) diff --git a/code-for-aurum/Step1-PreProc/Step1C-Generate-SQL-queries.py b/code-for-aurum/Step1-PreProc/Step1C-Generate-SQL-queries.py new file mode 100644 index 0000000..a231161 --- /dev/null +++ b/code-for-aurum/Step1-PreProc/Step1C-Generate-SQL-queries.py @@ -0,0 +1,62 @@ +# Code snippet to generate a file containing SQL queries which creates the individual tables, loads in data from data csv files, and reads the data +# Run as: python Step1C-Generate-SQL-queries.py path-to-files metadata_version +# E.g. 
python Step1C-Generate-SQL-queries.py /proc-data/SYN_AURUM v2p9 +# User gives the directory path which should contain two sub directories 'metadata_csv' and 'data_csv' created & populated from Step1A and Step1B respectively + +# Libraries +import os +import csv +import pandas as pd + +# Inputs and directories +data_input_path = sys.argv[1] + '/data_csv' #directory containing csv data files +metadata_input_path = sys.argv[1] + '/metadata_csv' #directory containing csv metadata files +output_path = sys.argv[1] + '/create-tables' +metadata_version = sys.argv[2] + +## ! don't change code below + +#assignment +directory = os.fsencode(data_input_path) +list_csv= [] + +#loop over data files in directory +for file in os.listdir(directory): + filename = os.fsdecode(file) + if filename.endswith(".csv"): + #filename without csv extension + name = filename.split('.')[0] + print('name:', name) + #read metadata to dataframe and replace datatypes to psql compatible format + df = pd.read_csv(metadata_input_path + name + "-" + metadata_version + ".csv") + df = df.replace('INTEGER','INT') + # zip column field names and data types + if 'Dictionary' in name: + pairs = zip(df['Column name'],df['Type']) + field_name = df['Column name'] + else: + pairs = zip(df['Field name'],df['Type']) + field_name = df['Field name'] + no_fields = len(df['Type']) + #print('no_fields:', no_fields) + + # append complete SQL query to csv file + list_csv.append( "DROP TABLE IF EXISTS " + name + ";\n" + \ + "CREATE TABLE " + name + " ("+ \ + " ".join([p[1][0] + " " + p[1][1] + "," if p[0]+1 != no_fields else p[1][0] + " " + p[1][1] for p in enumerate(pairs) ]) + \ + "); \n" + "COPY " + name + "(" + \ + " ".join(f[1] + "," if f[0]+1 != no_fields else f[1] for f in enumerate(field_name)) + \ + ") FROM '" + data_input_path + filename + "' WITH (FORMAT 'csv', DELIMITER ',', HEADER, QUOTE '\"'); \n" + \ + 'SELECT * FROM ' + name + ' LIMIT 5; \n' + ) + + print('done, next') + continue + else: + print('No csv files 
found in directory specified.') + continue + +# write contents list_csv into a file (that can be run to create tables in sql) +with open(output_path + '/Step1C-create-tables.sql', 'w', newline = '') as csvfile: + csvwriter = csv.writer(csvfile, delimiter = '\n', quoting = csv.QUOTE_NONE, escapechar = '\t') + csvwriter.writerow(list_csv) diff --git a/code-for-aurum/Step2-Notebooks/Step2A-intro-cprd-aurum-synthetic-tables.ipynb b/code-for-aurum/Step2-Notebooks/Step2A-intro-cprd-aurum-synthetic-tables.ipynb new file mode 100644 index 0000000..f8ce788 --- /dev/null +++ b/code-for-aurum/Step2-Notebooks/Step2A-intro-cprd-aurum-synthetic-tables.ipynb @@ -0,0 +1,814 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTEBOOK SET UP (1) - ask for credentials and db info from user\n", + "import getpass\n", + "my_username = input('Your username: ')\n", + "my_password = getpass.getpass(prompt='Your password: ', stream=None)\n", + "this_host = input('Host name: ')\n", + "this_db = input('Database name: ')\n", + "\n", + "# NOTEBOOK SET UP (2) - load Jupyter magic functions & connect to db (assumes db & tables already created)\n", + "%load_ext sql\n", + "%sql postgresql+psycopg2://{my_username}:{my_password}@{this_host}/{this_db}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTEBOOK SET UP (3) - ask for necessary paths\n", + "GH_path = input(\"Local path to GH folder 'cprd-data-wrangle': \")\n", + "txt_data_path = input(\"Local path to CPRD Aurum txt files: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction to CPRD Aurum Sample Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The aim of this notebook is to provide familiarity with the tables that make up the CPRD Aurum Sample (Synthetic) Dataset.\n", + "\n", + "This notebook assumes you have created a SQL 
database with the CPRD tables within. See code Step1A, Step1B and Step1C in `code-for-aurum` to see how the raw text files were transformed into tables within a SQL database.\n", + "\n", + "This notebook can also act as a sanity check that you can view and query all the tables in your database. \n", + "\n", + "\n", + "## About the dataset\n", + "\n", + "The [data release notes on CPRD's website](https://www.cprd.com/synthetic-data) summarises the purpose of this synthetic dataset, instructs how to cite it, and presents summary statistics. \n", + "\n", + "Other than this, it points towards the [main Aurum data specifications](https://cprd.com/primary-care-data-public-health-research) for understanding the synthetic data files. This data specification includes the metadata that applies to both the synthetic and the real data (how tables are linked, what tables contain, field descriptions for each table)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List the raw files and their size\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# List the raw files and their size\n", + "\n", + "import os\n", + "import pandas as pd\n", + "\n", + "file_list_df = pd.DataFrame(os.listdir(txt_data_path), columns =['FileName'])\n", + "file_list_df[\"MB\"] = \" \"\n", + "\n", + "for index, row in file_list_df.iterrows():\n", + " this_FileSize = os.path.getsize(txt_data_path + row['FileName'])\n", + " thisFileSize_MB = (this_FileSize / 1024) / 1024\n", + " file_list_df.loc[index][\"MB\"] = round(thisFileSize_MB, 2)\n", + "print(file_list_df)\n", + "\n", + "File_Count = len(file_list_df.index)\n", + "MB_Total = round(file_list_df['MB'].sum(),2)\n", + "print('\\n' + '################################' + '\\nTotal of all ' + str(File_Count) + ' files: ' + str(MB_Total) + ' MB' + '\\n################################')\n", + "print('\\n' + \"These are flat files stored as plain text (.txt).\" 
\n", + " + '\\n' + \"The real data will be bigger than the synthetic data (GB not MB).\" \n", + " + '\\n' + \"Therefore, the real data may store some text files listed here across multiple files.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## List all tables in this sql database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%sql \n", + "SELECT table_name \n", + "FROM information_schema.tables \n", + "WHERE table_schema='public' AND table_type='BASE TABLE'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preview the data from one table\n", + "\n", + "The notebook will prompt you for the name of the table and the number of rows you want to preview.\n", + "\n", + "Tip: execute the SQL cell more than once for the same table because 'ORDER BY RANDOM()' will show you different data each time.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "table_name = input('Table name: ')\n", + "n_rows = input('N rows to view: ')\n", + "%sql SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT {n_rows} ;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preview the data from all tables\n", + "\n", + "Tip: consider if you want to run this, because it will take some minutes to run and produce a lot of outputs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for index, row in file_list_df.iterrows():\n", + " file_name = row['FileName']\n", + " table_name = file_name.split('.')[0]\n", + " table_preview = %sql SELECT * FROM {table_name} ORDER BY RANDOM() LIMIT 3 ;\n", + " print('\\n' + '## Table ' + str(index) + ' of ' + str(File_Count) + '\\n' + '## This table is ' + table_name)\n", + " display(table_preview)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Detailed exploration - Oct 2021 Release\n", + "\n", + "The code *above* should in theory work for any CPRD data release, as it does not assume anything about the table names or how the tables are linked, and asks for user input.\n", + "\n", + "The code *below* takes a guided and more detailed look at each table. This code will only run for you if your table names match those within the [October 2021 release](https://cprd.com/sites/default/files/2022-02/CPRD%20Aurum%20Sample%20Dataset%20Release%20Notes.pdf) of the CPRD Synthetic Aurum Dataset. The code below assumes information about table linkage that is based on this release date. 
\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Size comparison: real versus synthetic \n", + "\n", + "Taking the real CPRD Aurum data to be the [May 2022 release](https://cprd.com/sites/default/files/2022-05/2022-05%20CPRD%20Aurum%20Release%20Notes.pdf) and the synthetic CPRD data to be [October 2021 release](https://cprd.com/sites/default/files/2022-02/CPRD%20Aurum%20Sample%20Dataset%20Release%20Notes.pdf):\n", + "\n", + "| Metric | Real | Synthetic | Synthetic % of Real |\n", + "| -| - | - | - |\n", + "| Total Acceptable Patients | 41,200,722 | 39,388 | 0.1% |\n", + "| Total Current Patients | 13,300,067 | 13,858 | 0.1% |\n", + "| Total Practices (current & historic) | 1,491 | 14 | 0.9% |\n", + "\n", + "The table shows that the real dataset has ~1,000 times more patients (total or current) and ~100 times more practices. \n", + "\n", + "Available follow-up time in years since 1st Jan 1995 (mean, sd, percentiles) is similar for the real and synthetic datasets. \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## What do the 27 files contain and how do they link together?\n", + "\n", + "At the time of writing this notebook, [v3.4 of the Aurum data specifications](https://www.cprd.com/sites/default/files/2024-04/CPRD%20Aurum%20Data%20Specification%20v3.4.pdf) describes **8 main data files** and **2 data dictionaries**. The other **17 files are lookup tables** to give values for the fields within the main files. However, the descriptions of the fields within these lookup tables are not included in the data specifications. 
\n", + "\n", + "See the figure on page 5 of [v3.4 of the Aurum data specifications](https://www.cprd.com/sites/default/files/2024-04/CPRD%20Aurum%20Data%20Specification%20v3.4.pdf) which shows how each table is linked with one another and via which ID:\n", + "\n", + "\"cprd-aurum-data-structure\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preview the `Medical dictionary` and associated lookup table\n", + "\n", + "\"The Medical Dictionary contains information on all medical history observations that have been recorded in EMIS Web®. Observations are coded using a combination of SNOMED, Read and local EMIS® codes. Further information is provided in later sections of this document.\" *CPRD Aurum Data Specification Version 3.4*\n", + "\n", + "- Links to the `Consultation` and `Observation` data tables on 'medcodeid'\n", + "- Links to the `EMISCodeCat` lookup table on 'emiscodecategoryid'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- MedicalDictionary table\n", + "SELECT * FROM MedicalDictionary \n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- EMISCodeCat lookup table\n", + "SELECT * FROM EMISCodeCat\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "source": [ + "### Tip: Execute SQL cells more than once because 'ORDER BY RANDOM()' will show you different data each time" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preview the `Product dictionary` \n", + "\n", + "\"The Product Dictionary contains information on drug and appliance prescriptions recorded in EMIS Web®. 
This information is coded using the Dictionary of Medicines and Devices (DM+D). Further information is provided in later sections of this document.\" *CPRD Aurum Data Specification Version 3.4*\n", + "\n", + "- Links to the `Drug Issue` data table on 'prodcodeid'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- ProductDictionary table\n", + "SELECT * FROM ProductDictionary \n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preview the data tables and associated lookup tables\n", + "\n", + "### `Patient` table \n", + "The `Patient` table \"contains basic patient demographics and patient registration details for the patients.\" *CPRD Aurum Data Specification Version 3.4*\n", + "\n", + "- Links to the `Practice` data table on 'pracid'\n", + "- Links to the `Staff` data table on 'usualgpstaffid'\n", + "- Links to the `Consultation`, `Observation` and `Drug Issue` data tables on 'patid'\n", + "- Links to the `Gender` lookup table on 'gender'\n", + "- Links to the `PatientType` lookup table on 'patienttypeid'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Patient table \n", + "SELECT * FROM Patient \n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Gender lookup table\n", + "SELECT * FROM Gender;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- PatientType lookup table\n", + "SELECT * FROM PatientType\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Practice` table\n", + "The `Practice` table \"contains details of each practice, including the practice identifier, practice region, and the last collection date.\" *CPRD Aurum Data Specification Version 3.4*\n", + "\n", + "- Links to the `Patient` data table on 'pracid'\n", + "- Links to the `Region` lookup table on 'region'\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Practice table\n", + "SELECT * FROM Practice\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Region lookup table\n", + "SELECT * FROM Region;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Staff` table\n", + "The `Staff` table contains practice staff details for each staff member, including job category. 
*CPRD Aurum Data Specification Version 3.4*\n", + "- Links to the `Patient` data table on 'staffid'\n", + "- Links to the `Practice` data table on the 'pracid' \n", + "- Links to the `JobCat` lookup table on 'jobcatid'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Staff table \n", + "SELECT * FROM Staff \n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- JobCat lookup table\n", + "SELECT * FROM JobCat\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Consultation` table\n", + "The `Consultation` table \"contains information relating to the type of consultation as entered by the GP (e.g. telephone, home visit, practice visit). 
Some consultations are linked to observations that occur during the consultation via the consultation identifier (consid).\" *CPRD Aurum Data Specification Version 3.4*\n", + "- Links to the `Patient` data table on 'patid'\n", + "- Links to the `Practice` data table on 'pracid'\n", + "- Links to the `Staff` data table on 'staffid' \n", + "- Links to the `Observation` data table on 'consid'\n", + "- Links to the `MedicalDictionary` table on 'consmedcodeid'\n", + "- Links to the `ConsSource` lookup table on 'conssourceid'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Consultation table\n", + "SELECT * FROM Consultation \n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- ConsSource lookup table\n", + "SELECT * FROM ConsSource\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Observation` table\n", + "The `Observation` table \"contains the medical history data entered on the GP system including symptoms, clinical measurements, laboratory test results, and diagnoses, as well as demographic information recorded as a clinical code (e.g. patient ethnicity). Observations that occur during a consultation can be linked via the consultation identifier. CPRD Aurum data are structured in a long format (multiple rows per subject), and observations can be linked to a parent observation. 
For example, measurements of systolic and diastolic blood pressure will be grouped together via a parent observation for blood pressure measurement.\" *CPRD Aurum Data Specification Version 3.4*\n", + "- Links to the `Patient` data table on 'patid'\n", + "- Links to the `Practice` data table on 'pracid'\n", + "- Links to the `Staff` data table on 'staffid'\n", + "- Links to the `Consultation` data table on 'consid'\n", + "- Links to the `Problem` and `Referral` data tables on 'obsid'\n", + "- Links to the `MedicalDictionary` table on 'medcodeid'\n", + "- Links to the `NumUnit` lookup table on 'numunitid'\n", + "- Links to the `ObsType` lookup table on 'obstypeid'\n", + "- Links to itself on 'parentobsid' and 'probobsid'\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Observation table \n", + "SELECT * FROM Observation\n", + "WHERE value IS NOT NULL AND numunitid IS NOT NULL AND numrangelow IS NOT NULL AND numrangehigh IS NOT NULL AND probobsid != 'None'\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- NumUnit lookup table\n", + "SELECT * FROM NumUnit\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- ObsType lookup table\n", + "SELECT * FROM ObsType;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Referral` table\n", + "The `Referral` table \"contains referral details recorded on the GP system. Data in the referral file are linked to the observation file and contain ‘add-on’ data for referral-type observations. 
These files contain information involving both inbound and outbound patient referrals to or from external care centres (normally to secondary care locations such as hospitals for inpatient or outpatient care). To obtain the full referral record (including reason for the referral and date), referrals should be linked to the Observation file using the observation identifier (obsid).\" *CPRD Aurum Data Specification Version 3.4*\n", + "- Links to the `Patient` data table on 'patid'\n", + "- Links to the `Practice` data table on 'pracid'\n", + "- Links to the `Observation` data table on 'obsid'\n", + "- Links to `RefServiceType` lookup table on 'refservicetypeid'\n", + "- Links to `RefUrgency` lookup table on 'refurgencyid'\n", + "- Links to `OrgType` lookup table on 'refsourceorgid'\n", + "- Links to `RefMode` lookup table on 'refmodeid'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Referral table\n", + "SELECT * FROM Referral\n", + "WHERE reftargetorgid IS NOT NULL AND refurgencyid IS NOT NULL AND refservicetypeid IS NOT NULL AND refmodeid IS NOT NULL \n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- RefServiceType lookup table\n", + "SELECT * FROM RefServiceType;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- RefUrgency lookup table\n", + "SELECT * FROM RefUrgency;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- OrgType lookup table\n", + "SELECT * FROM OrgType LIMIT 3;" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- RefMode lookup table\n", + "SELECT * FROM RefMode;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `Problem` table\n", + "The `Problem` table \"contains details of the patient’s medical history that have been defined by the GP as a ‘problem’. Data in the problem file are linked to the observation file and contain ‘add-on’ data for problem-type observations. Information on identifying associated problems, the significance of the problem and its expected duration can be found in this table. GPs may use ‘problems’ to manage chronic conditions as it would allow them to group clinical events (including drug prescriptions, measurements, symptom recording) by problem rather than chronologically. To obtain the full problem record (including the clinical code for the problem), problems should be linked to the Observation file using the observation identifier (obsid).\" *CPRD Aurum Data Specification Version 3.4*\n", + "- Links to the `Patient` data table on 'patid'\n", + "- Links to the `Practice` data table on 'pracid'\n", + "- Links to the `Staff` data table on 'lastrevstaffid'\n", + "- Links to the `Observation` data table on 'obsid' and 'parentprobobsid'\n", + "- Links to the `ParentProbRel` lookup table on 'parentprobrelid'\n", + "- Links to the `ProbStatus` lookup table on 'probstatusid'\n", + "- Links to the `Sign` lookup table on 'signid'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "--Problem table \n", + "SELECT * FROM Problem\n", + "WHERE lastrevdate IS NOT NULL AND lastrevstaffid IS NOT NULL AND parentprobrelid IS NOT NULL AND probstatusid IS NOT NULL AND signid IS NOT NULL\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- ParentProbRel lookup table\n", + "SELECT * FROM ParentProbRel;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- ProbStatus lookup table\n", + "SELECT * FROM ProbStatus;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Sign lookup table\n", + "SELECT * FROM Sign;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### `DrugIssue` table\n", + "The `DrugIssue` table \"contains details of all prescriptions on the GP system. This file contains data relating to all prescriptions (for drugs and appliances) issued by the GP. Some prescriptions are linked to problem-type observations via the Observation file, using the observation identifier (obsid).\" *CPRD Aurum Data Specification Version 3.4*\n", + "- Links to the `Patient` data table on 'patid'\n", + "- Links to the `Practice` data table on 'pracid'\n", + "- Links to the `Staff` data table on 'staffid'\n", + "- Links to `Observation` and `Problem` data tables on 'probobsid'\n", + "- Links to `ProductDictionary` data dictionary table on 'prodcodeid'\n", + "- Links to the `Common_dosages` lookup table on 'dosageid'\n", + "- Links to the `QuantUnit` lookup table on 'quantunitid'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- DrugIssue table \n", + "SELECT * FROM DrugIssue\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Common_dosages lookup 
table\n", + "SELECT * FROM Common_dosages\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- QuantUnit lookup table\n", + "SELECT * FROM QuantUnit\n", + "ORDER BY RANDOM() \n", + "LIMIT 5;" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py38_default", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/code-for-aurum/Step2-Notebooks/Step2B-intro-cprd-aurum-synthetic-stats.ipynb b/code-for-aurum/Step2-Notebooks/Step2B-intro-cprd-aurum-synthetic-stats.ipynb new file mode 100644 index 0000000..4a688f2 --- /dev/null +++ b/code-for-aurum/Step2-Notebooks/Step2B-intro-cprd-aurum-synthetic-stats.ipynb @@ -0,0 +1,416 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTEBOOK SET UP (1) - ask for credentials and db info from user\n", + "import getpass\n", + "my_username = input('Your username: ')\n", + "my_password = getpass.getpass(prompt='Your password: ', stream=None)\n", + "this_host = input('Host name: ')\n", + "this_db = input('Database name: ')\n", + "\n", + "# NOTEBOOK SET UP (2) - load Jupyter magic functions & connect to db (assumes db & tables already created)\n", + "%load_ext sql\n", + "%sql postgresql+psycopg2://{my_username}:{my_password}@{this_host}/{this_db}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Summary statistics for CPRD Aurum Sample (Synthetic) Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The 
summary statistics created in this notebook follow the structure of those within the ['Release Notes: CPRD Aurum Sample Dataset October 2021'](https://www.cprd.com/sites/default/files/2022-02/CPRD%20Aurum%20Sample%20Dataset%20Release%20Notes.pdf) PDF. \n", + "\n", + "This notebook aims to replicate the numbers that CPRD provides using SQL commands, as an introduction to interacting with this dataset and the tables with SQL.\n", + "\n", + "This notebook assumes you have created a SQL database with the CPRD tables within. See code Step1A, Step1B and Step1C in `code-for-aurum` to see how the raw text files were transformed into tables within a SQL database.\n", + "\n", + "*We have not yet matched all the answers in the data specification - please let us know if you spot why!*\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Total number of acceptable patients (including transferred out and deceased patients)\n", + "Permanent registrations only. The ‘acceptable’ flag refers to a research quality threshold based on CPRD metrics." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Count total acceptable patients\n", + "SELECT COUNT(*)\n", + "FROM patient\n", + "WHERE acceptable = 1;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Current number of acceptable patients (i.e. 
registered at currently contributing practices, excluding transferred out and deceased patients)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT COUNT(*)\n", + "FROM patient\n", + "WHERE acceptable = 1 \n", + "AND cprd_ddate IS NULL -- The data spec suggests to use cprd_ddate instead of emis_ddate \n", + "AND regenddate IS NULL; -- regenddate is null means no date of registration ending " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Percentage of UK population coverage (current patients only)\n", + "Based on latest UK population estimates from the Office for National Statistics.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT COUNT(*)/667968.00 as percent_coverage\n", + "FROM patient\n", + "WHERE acceptable = 1\n", + "AND cprd_ddate IS NULL\n", + "AND regenddate IS NULL;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Available follow-up time in years since 1st January 1995 (all patients including transferred out and deceased):\n", + "Follow-up time stated here does not incorporate the up-to-standard (UTS) date and the database includes records pre-dating the 1st of January 1995\n", + "\n", + "*In this section, we don't quite match the answers in the release note!*\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Defining follow up time as difference between enddate and startdate\n", + "SELECT regenddate,\n", + "regstartdate,\n", + "regenddate-regstartdate AS followup_days,(regenddate-regstartdate)/365.0 AS followup_years\n", + "FROM Patient\n", + "WHERE regenddate IS NOT NULL\n", + "LIMIT 2;" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- AVERAGE for all patients\n", + "SELECT AVG(\n", + " (\n", + " CASE WHEN regenddate IS NULL \n", + " THEN '2021-10-01' ELSE regenddate END -- if patient had no regenddate we assume the enddate is the date of cprd publication\n", + " - \n", + " CASE WHEN regstartdate < '1995-01-01' -- we want to include all patients, but if regstartdate is before 1995-01-01, we only count from this date\n", + " THEN '1995-01-01' ELSE regstartdate END\n", + " )/365.0\n", + " )\n", + "FROM Patient;\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- MEDIAN for all patients\n", + "WITH cte AS (\n", + " SELECT \n", + " (CASE WHEN regenddate IS NULL \n", + " THEN '2021-10-01' ELSE regenddate END \n", + " - \n", + " CASE WHEN regstartdate < '1995-01-01'\n", + " THEN '1995-01-01' ELSE regstartdate END\n", + " )/365.0 AS followup_years\n", + " FROM Patient\n", + " )\n", + " --select * from cte \n", + " SELECT percentile_disc(0.25) WITHIN group (ORDER BY followup_years) FROM cte\n", + " UNION ALL\n", + " SELECT percentile_disc(0.5) WITHIN group (ORDER BY followup_years) FROM cte\n", + " UNION ALL \n", + " SELECT percentile_disc(0.75) WITHIN group (ORDER BY followup_years) FROM cte" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "---STDEV for all patients\n", + "SELECT STDDEV(\n", + " (\n", + " CASE WHEN regenddate IS NULL \n", + " THEN '2021-10-01' ELSE regenddate END \n", + " - \n", + " CASE WHEN regstartdate < '1995-01-01' \n", + " THEN '1995-01-01' ELSE regstartdate END\n", + " )/365.0\n", + " )\n", + "FROM Patient;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- AVERAGE for current patients\n", + "SELECT AVG(\n", + " (\n", + " CASE WHEN regenddate IS NULL \n", + " THEN '2021-10-01' ELSE regenddate END \n", + " - \n", + " CASE WHEN regstartdate < '1995-01-01'\n", + " THEN '1995-01-01' ELSE regstartdate END\n", + " )/365.0\n", + " )\n", + "FROM Patient\n", + "WHERE regenddate IS NULL\n", + "AND cprd_ddate IS NULL;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- MEDIAN for all patients\n", + "WITH cte AS (\n", + " SELECT \n", + " (CASE WHEN regenddate IS NULL \n", + " THEN '2021-10-01' ELSE regenddate END \n", + " - \n", + " CASE WHEN regstartdate < '1995-01-01'\n", + " THEN '1995-01-01' ELSE regstartdate END\n", + " )/365.0 AS followup_years\n", + " FROM Patient\n", + " WHERE regenddate IS NULL\n", + " AND cprd_ddate IS NULL\n", + " )\n", + " --select * from cte \n", + " SELECT percentile_disc(0.25) WITHIN group (ORDER BY followup_years) FROM cte\n", + " UNION ALL\n", + " SELECT percentile_disc(0.5) WITHIN group (ORDER BY followup_years) FROM cte\n", + " UNION ALL \n", + " SELECT percentile_disc(0.75) WITHIN group (ORDER BY followup_years) FROM cte\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- STDEV for current patients \n", + "SELECT STDDEV(\n", + " (\n", + " CASE WHEN regenddate IS NULL \n", + " THEN '2021-10-01' ELSE regenddate END \n", + " - \n", + " CASE WHEN regstartdate < '1995-01-01'\n", + " THEN '1995-01-01' ELSE regstartdate END\n", + " )/365.0\n", + " )\n", + "FROM Patient\n", + "WHERE regenddate IS NULL\n", + "AND cprd_ddate IS NULL;\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Total number of practices (current and historic) 
included in the database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT COUNT(*) FROM practice;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Total number of distinct practices \n", + "SELECT COUNT(DISTINCT practice) FROM practice;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Currently contributing practices\n", + "\n", + "> Code to come!\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Percentage coverage of UK general practices (currently contributing practices only)\n", + "Expressed as a percentage of all practices currently contributing to CPRD Aurum\n", + "\n", + "> Code to come! " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Regional distribution of currently contributing practices\n", + "\n", + "*In this section, we don't quite match the answers in the release note!*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Regional distribution of currently contributing practices\n", + "SELECT re.description AS Region, COUNT(pr.pracid) AS TotalPractices \n", + "FROM practice PR\n", + "INNER JOIN region re \n", + "ON re.regionid = pr.region\n", + "GROUP BY pr.region, re.description" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py38_default", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "orig_nbformat": 
4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/code-for-aurum/Step2-Notebooks/Step2C-cprd-cohort-notebooks.ipynb b/code-for-aurum/Step2-Notebooks/Step2C-cprd-cohort-notebooks.ipynb new file mode 100644 index 0000000..4671b4b --- /dev/null +++ b/code-for-aurum/Step2-Notebooks/Step2C-cprd-cohort-notebooks.ipynb @@ -0,0 +1,896 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CPRD Cohort Criteria Examples Workbook" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This workbook was created to replicate the example criteria given in CPRD Aurum FAQs v2.4 (see their [website](https://www.cprd.com/primary-care-data-public-health-research)) under the section *\"How will I know if the CPRD Aurum data are suitable for my research needs?\".* When we include text within quotation marks in the sections below, we are quoting from CPRD Aurum FAQs v2.4. \n", + "\n", + "We are using these examples to increase our understanding of the tables, how they all relate to each other and how to write queries for example criteria. These types of queries would allow a research team to filter the CPRD data, to create a sample cohort that matches their research questions e.g. select patients within a certain age range and on a specific medication. 
" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----\n", + "Preliminary setup code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# NOTEBOOK SET UP (1) - ask for credentials and db info from user\n", + "import getpass\n", + "my_username = input('Your username: ')\n", + "my_password = getpass.getpass(prompt='Your password: ', stream=None)\n", + "this_host = input('Host name: ')\n", + "this_db = input('Database name: ')\n", + "\n", + "# NOTEBOOK SET UP (2) - load Jupyter magic functions & connect to db (assumes db & tables already created)\n", + "%load_ext sql\n", + "%sql postgresql+psycopg2://{my_username}:{my_password}@{this_host}/{this_db}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Below we've written queries for each example criterion step-by-step, hopefully making the complete query easier to understand." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Part 1. Counts based on one criterion:\n", + "\n", + "*\"The total number of patients in CPRD GOLD or CPRD Aurum with a first ever prescription for\n", + "metformin recorded during 01/01/2004 - 31/12/2015, stratified by calendar year\"*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "a. 
Find all patients on metformin prescription" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- FILTERING all terms relating to metformin from product dictionary into a new table called A_MetforminProdID\n", + "DROP TABLE IF EXISTS A_MetforminProdID;\n", + "SELECT * INTO A_MetforminProdID FROM ProductDictionary \n", + "WHERE (\n", + " UPPER(DrugSubstanceName) LIKE '%METFORMIN%' -- input terms to include here\n", + " );\n", + " \n", + "-- Drug issue table contains data relating to all drug prescriptions issued by the GP, \n", + "-- so joining this table to the filtered product dictionary, we'll get all metformin related prescriptions\n", + "SELECT DI.*, '<----->' AS BOUNDARY, MP.* FROM DrugIssue DI\n", + "INNER JOIN A_MetforminProdID MP ON MP.prodcodeID = DI.prodcodeID\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "source": [ + "b. 
Now we have a way of finding all patients with metformin prescriptions issued, we want to find the very first metformin prescription issued, grouped by patient: \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Grouping metformin prescription for each patient\n", + "SELECT patid, issuedate FROM DrugIssue DI\n", + "INNER JOIN A_MetforminProdID MP ON MP.prodcodeID = DI.prodcodeID\n", + "GROUP BY patid, issuedate\n", + "ORDER BY issuedate ASC\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Insert above grouping into a new table called A_MetforminPatients:\n", + "DROP TABLE IF EXISTS A_MetforminPatients;\n", + "SELECT patid, issuedate INTO A_MetforminPatients \n", + "FROM DrugIssue DI\n", + "INNER JOIN A_MetforminProdID MP ON MP.prodcodeID = DI.prodcodeID\n", + "--WHERE patid = 'XXXXXXXX' -- to look at single patient sample\n", + "GROUP BY patid, issuedate\n", + "ORDER BY issuedate ASC\n", + "\n", + "-- Ordering to find first metformin prescription for each patient\n", + ";WITH added_row_number AS (\n", + " SELECT\n", + " *,\n", + " ROW_NUMBER() OVER(PARTITION BY patid ORDER BY issuedate ASC) AS row_number\n", + " FROM A_MetforminPatients\n", + ")\n", + "SELECT\n", + " *\n", + "FROM added_row_number\n", + "--WHERE row_number = 1 --AND patid = 'XXXXXXXX' -- single patient sample\n", + "LIMIT 20;" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "source": [ + "c. 
We have the first metformin prescriptions issued for each patient, so now we want to keep only those within the given date interval: 01/01/2004 - 31/12/2015 \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "WITH added_row_number AS (\n", + " SELECT\n", + " *,\n", + " ROW_NUMBER() OVER(PARTITION BY patid ORDER BY issuedate ASC) AS row_number\n", + " FROM A_MetforminPatients\n", + ")\n", + "SELECT\n", + " *\n", + "FROM added_row_number\n", + "WHERE row_number = 1 --AND patid = 'XXXXXXXXXX'\n", + "AND issuedate BETWEEN '2004-01-01' AND '2015-12-31' \n", + "LIMIT 3;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "d. The last condition is to stratify by calendar year, which we'll do below:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- 4. Group by calendar year\n", + "WITH added_row_number AS (\n", + " SELECT\n", + " *,\n", + " ROW_NUMBER() OVER(PARTITION BY patid ORDER BY issuedate ASC) AS row_number\n", + " FROM A_MetforminPatients\n", + ")\n", + "SELECT\n", + " DATE_PART('YEAR',issuedate), count(patid)--, issuedate\n", + "FROM added_row_number\n", + "WHERE row_number = 1 --AND patid = 'XXXXXXXXXX'\n", + "AND issuedate BETWEEN '2004-01-01' AND '2015-12-31' \n", + "GROUP BY DATE_PART('YEAR',issuedate)--,patid, issuedate\n", + "LIMIT 8;\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above makes use of the product dictionary, and a sample string ('metformin') to filter out relevant drugs. You may however already have a list of drugs that you'd like to load in directly and use to filter out patients, for which we'll have an example below."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "----" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Part 2. Counts based on two criteria:\n", + "\n", + "\n", + "\n", + "1) \"Total number of patients with a medical diagnosis of Type 2 diabetes mellitus recorded in CPRD GOLD or HES APC \n", + "on or before 31/12/2005\" \n", + "OR\n", + "2) \"Prescriptions for anti-diabetic medication in CPRD GOLD (note - provided in one code list) \n", + "on or before 31/12/2005. Patients must have at least 12 months of prior registration before their earliest event date.\"\n", + "\n", + "Rephrased criteria: \n", + "\n", + "1) Total number of patients with a medical diagnosis of Type 2 diabetes mellitus recorded in CPRD Aurum on or before 31/12/2005 \n", + "OR\n", + "2) Prescriptions for anti-diabetic medication in CPRD Aurum (note - provided in one code list) on or before 31/12/2005. \n", + "Patients must have at least 12 months of prior registration before their earliest event date.\n", + "\n", + "We'll split up both criteria (1 & 2) and start with the first one, in segments, below:\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1a. 
Load in pre-existing code list for Type2Diabetes, CPRD Aurum, from OPTIMAL's THINK repo (or any other of your own choice)\n", + "\n", + "link here: https://github.com/THINKINGGroup/phenotypes/tree/main/Medical%20conditions/Type2Diabetes_11_3_21_birm_cam\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Specify file path for your downloaded code list file (for example: User/Docs/codelists/type2diabetes.csv)\n", + "code_list_dir = input('Enter path for type 2 diabetes code list file')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Create a new table to load list into (make sure this matches the headers in your code list file!)\n", + "DROP TABLE IF EXISTS A_Type2Diabetes_CodeLists;\n", + "CREATE TABLE A_Type2Diabetes_CodeLists\n", + "(MEDICAL_CODE_ID TEXT,DESCRIPTION TEXT,READ_CODE TEXT,SNOMED_CT_CODE TEXT,DATASOURCE TEXT);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Loading in code list from directory specified above\n", + "DO $$ \n", + "DECLARE\n", + " filepath TEXT := :code_list_dir;\n", + "BEGIN\n", + " EXECUTE 'COPY A_Type2Diabetes_CodeLists (MEDICAL_CODE_ID,DESCRIPTION,READ_CODE,SNOMED_CT_CODE,DATASOURCE) FROM ''' || filepath || ''' WITH (FORMAT ''csv'', DELIMITER '','', HEADER, QUOTE ''\"'');';\n", + "END $$;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- View table contents \n", + "SELECT * FROM A_Type2Diabetes_CodeLists LIMIT 2;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Use code 
list to filter medcodeids in medical dictionary for Type2Diabetes\n", + "SELECT * FROM medicaldictionary\n", + "WHERE medcodeid IN (SELECT medical_code_id FROM A_Type2Diabetes_CodeLists)\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1b. Find all type 2 diabetes related observations" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- View patients with type2db medcodeid in observation table\n", + "SELECT * FROM observation \n", + "WHERE medcodeid IN (SELECT medical_code_id FROM A_Type2Diabetes_CodeLists)\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "--Total observations related to type2db\n", + "(SELECT 'Total obs related to typ2db' AS Totals, count(*) FROM observation\n", + "WHERE medcodeid IN (SELECT medical_code_id FROM A_Type2Diabetes_CodeLists))\n", + "UNION\n", + "-- Total distinct patients with observations related to/medical diagnosis of Type 2 diabetes mellitus \n", + "(SELECT 'Total distinct patients with obs related to typ2db', count(DISTINCT patid) FROM observation\n", + "WHERE medcodeid IN (SELECT medical_code_id FROM A_Type2Diabetes_CodeLists));" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "1c. Medical diagnosis (inferred by a medcodeid occurrence) recorded on or before 31/12/2005:\n", + "\n", + " Note to check assumptions of methodology below: we might want to join the problem table and consider the parent problem label - we did it manually for now. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- Find all type2d observations grouped (least to most recent, ascending) for each patient\n", + "SELECT patid,obsdate,medcodeid,t.description FROM observation o \n", + "INNER JOIN A_Type2Diabetes_CodeLists t ON t.medical_code_id = o.medcodeid \n", + "WHERE medcodeid IN (SELECT medical_code_id FROM A_Type2Diabetes_CodeLists)\n", + "GROUP BY patid,obsdate,medcodeid, t.description\n", + "ORDER BY patid,obsdate ASC\n", + "LIMIT 5;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- Find earliest observation and add constraint for observation date\n", + "WITH patid_ranked AS (\n", + "SELECT *, \n", + "ROW_NUMBER() OVER(PARTITION BY patid ORDER BY obsdate ASC) AS row_number\n", + "FROM observation WHERE medcodeid IN (SELECT medical_code_id FROM A_Type2Diabetes_CodeLists)\n", + ")\n", + "SELECT * FROM patid_ranked\n", + "WHERE row_number = 1 -- earliest observation when ordered by obsdate ascending\n", + "AND obsdate <= '2005-12-31' -- add date constraint for the first observation \n", + "LIMIT 3; \n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll look at the second criterion:\n", + "\n", + "2. Prescriptions for anti-diabetic medication in CPRD Aurum (note - provided in one code list) on or before 31/12/2005. \n", + "Patients must have at least 12 months of prior registration before their earliest event date.\n", + "\n", + "Edit: date modified from 31/12/2005 to 31/12/2015 as this was a more suitable range for the synthetic Aurum dataset. If you are running this notebook on the real Aurum dataset, try the original date threshold. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2a. 
Load in multiple code lists from OPTIMAL's THINK repo (or any resource of your own preference) that relate to anti-diabetic medications.\n", + "\n", + "links here: \n", + "- https://github.com/THINKINGGroup/phenotypes/blob/main/Drug%20Codes\n", + "- https://phenotypes.healthdatagateway.org/phenotypes/\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- Create a new filename table, as there are multiple antidiabetic drug files\n", + "DROP TABLE IF EXISTS A_AntiDiabeticDrug_CodeList_Name;\n", + "CREATE TABLE A_AntiDiabeticDrug_CodeList_Name(FILEID INT GENERATED BY DEFAULT AS IDENTITY, DRUGFILENAME TEXT);\n", + "SELECT * FROM A_AntiDiabeticDrug_CodeList_Name;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "## Change directory here to the folder containing the code lists:\n", + "directory_path = input('Specify directory for folder containing the diabetic medication code lists')\n", + "\n", + "# Iterate over all the code lists in the specified directory\n", + "for filename in os.listdir(directory_path):\n", + " file_path = os.path.join(directory_path, filename)\n", + " if os.path.isfile(file_path):\n", + " # Insert filenames into the table\n", + " %sql INSERT INTO A_AntiDiabeticDrug_CodeList_Name (DRUGFILENAME) VALUES (:filename)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Create a new antidiabetic medication code list table (note this should match your code list file headers):\n", + "DROP TABLE IF EXISTS A_AntiDiabeticDrug_CodeLists;\n", + "CREATE TABLE A_AntiDiabeticDrug_CodeLists\n", + "(DRUG_CODE_ID TEXT,DESCRIPTION TEXT,BNF1 TEXT,BNF2 TEXT,BNF3 TEXT,ATC TEXT,DATABASE TEXT);" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{}, + "source": [ + "Create function to load in data below, which we will then use to loop over multiple code lists\n", + "\n", + "*You will need to change the values according to your own code list file contents \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "-- Function to load code lists into table in a LOOP\n", + "CREATE OR REPLACE FUNCTION A_loaddata(filepathname TEXT)\n", + " RETURNS void\n", + " LANGUAGE plpgsql AS\n", + "$func$\n", + "BEGIN\n", + " EXECUTE format ('\n", + " COPY A_AntiDiabeticDrug_CodeLists(DRUG_CODE_ID,DESCRIPTION,BNF1,BNF2,BNF3,ATC,DATABASE) -- more columns\n", + " FROM %L (FORMAT CSV, HEADER)' -- modern syntax\n", + " -- WITH CSV HEADER' -- tolerated legacy syntax\n", + " , $1); -- pass 1st function parameter (filepathname) to format() \n", + "END\n", + "$func$;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, loop over drug list filenames and apply loaddata() function:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- SELECT * FROM A_AntiDiabeticDrug_CodeList_Name; -- contains filenames\n", + "-- SELECT * FROM A_AntiDiabeticDrug_CodeLists; -- table to insert into\n", + "\n", + "--Loop loaddata() function over drug list filenames to insert into table:\n", + "do $$\n", + "DECLARE fileName VARCHAR;\n", + "BEGIN\n", + "FOR fileName IN \n", + "SELECT drugfilename FROM A_AntiDiabeticDrug_CodeList_Name\n", + "LOOP\n", + " PERFORM A_loaddata(\n", + " :directory_path || '/' || fileName\n", + " );\n", + "END LOOP;\n", + "END;\n", + "$$;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "--Check table for loaded data:\n", + "SELECT * FROM 
A_AntiDiabeticDrug_CodeLists\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2b. Now we want to use the antidiabetic code lists table to filter out the relevant drugs from dictionary (or we can use code list table directly, since it already has cprd aurum identifiers if it's a complete match)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- a. Use code list to filter prodcodeids in product dictionary for anti-diabetic medication\n", + "SELECT * FROM productdictionary \n", + "WHERE PRODCODEID IN (SELECT drug_code_id FROM A_AntiDiabeticDrug_CodeLists)\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- You can also check if there's a count match between all the codes in your code list vs. codes in the product dictionary table as a sanity check\n", + "(SELECT COUNT(DISTINCT drug_code_id) FROM A_AntiDiabeticDrug_CodeLists) --\n", + "UNION ALL\n", + "(SELECT count(*) FROM productdictionary \n", + "WHERE PRODCODEID IN (SELECT drug_code_id FROM A_AntiDiabeticDrug_CodeLists)\n", + ");" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2c. 
Find patient observations relating to antidiabetic drugs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "SELECT DI.*, ADC.* FROM drugissue DI \n", + "INNER JOIN A_AntiDiabeticDrug_CodeLists ADC ON DI.prodcodeid = ADC.drug_code_id\n", + "WHERE DI.issuedate <= '2015-12-31'\n", + "ORDER BY enterdate ASC\n", + "LIMIT 2;\n", + "\n", + "-- Total patient prescriptions with antidiabetic drugs\n", + "SELECT COUNT(DISTINCT patid) FROM drugissue DI \n", + "INNER JOIN A_AntiDiabeticDrug_CodeLists ADC ON DI.prodcodeid = ADC.drug_code_id\n", + "WHERE DI.issuedate <= '2015-12-31'\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "2d. Patients must have at least 12 months of prior registration before their earliest event date (earliest prescription).\n", + "\n", + "So below we have a table with earliest prescription issue date for each patient on antidiabetic drugs, where patients also have AT LEAST a one year interval between first issue of metformin and registration date at GP clinic \n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "-- Table with earliest prescription issue date \n", + "-- for each patient on antidiabetic drugs\n", + "WITH patid_ranked AS (\n", + " SELECT DI.*, \n", + " ROW_NUMBER() OVER(PARTITION BY patid ORDER BY issuedate ASC) AS row_number,\n", + " ADC.*\n", + " FROM drugissue DI \n", + " INNER JOIN A_AntiDiabeticDrug_CodeLists ADC ON DI.prodcodeid = ADC.drug_code_id\n", + " WHERE DI.issuedate <= '2015-12-31'\n", + " )\n", + "SELECT P.regstartdate,R.issuedate,R.issuedate - P.regstartdate AS DIFF, R.issuedate - interval '1 year' AS IssueDateMinus1Year,R.* FROM patid_ranked R\n", + " INNER JOIN Patient P ON P.patid = R.patid\n", + " WHERE row_number = 1\n", + " AND 
P.regstartdate <= (R.issuedate - interval '1 year')\n", + " ORDER BY DIFF ASC\n", + " LIMIT 3;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "------\n", + "\n", + "#### Part 3. Counts based on three criteria:\n", + "1) The total number of patients in CPRD Aurum with an incident medical\n", + "diagnosis of Type 2 diabetes mellitus recorded during 01/01/2004 - 31/12/2015 OR\n", + "2) Incident prescription of anti-diabetic medication (note - provided in one code list) documented during\n", + "01/01/2004 - 31/12/2015 AND\n", + "3) Have a test record for HbA1c recorded in CPRD Aurum (note - test value not assessed)\n", + "\n", + "The first and second criteria have already been covered previously; here we focus on the third criterion only.\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3a. Find all terms related to '%HbA1%' and insert into a new table called A_hba1_medcodes\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql \n", + "--SELECT * FROM Observation LIMIT 10;\n", + "--SELECT * FROM obstype ORDER BY 1; -- lookup table\n", + "DROP TABLE IF EXISTS A_hba1_medcodes;\n", + "SELECT * INTO A_hba1_medcodes FROM medicaldictionary\n", + "WHERE term LIKE '%HbA1%';\n", + "--View table values:\n", + "SELECT * FROM A_hba1_medcodes LIMIT 2;\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3b. 
Finding all patients with HbA1C related observations, where the observation type is a 'value' and the value is populated (not a NULL value)\n", + "\n", + "*Note, assumption made below, that 'value' obstype gives the test record\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "\n", + "-- if we want to see all the distinct observation types \n", + "SELECT DISTINCT obs.obstypeid FROM Observation obs\n", + "INNER JOIN obstype ot ON ot.obstypeid = obs.obstypeid\n", + "WHERE obs.medcodeid IN (SELECT medcodeid FROM A_hba1_medcodes)\n", + "LIMIT 2;\n", + "\n", + "--checking what different ids correspond to\n", + "SELECT * FROM numunit;\n", + "\n", + "--Joining tables to see observations for patients where \n", + "--type of observation is a 'value' type, not null, \n", + "-- and related to 'hba1c' term in dictionary \n", + "SELECT obs.*, ot.*, md.* FROM Observation obs\n", + "INNER JOIN obstype ot ON ot.obstypeid = obs.obstypeid\n", + "INNER JOIN medicaldictionary md ON md.medcodeid = obs.medcodeid\n", + "WHERE obs.medcodeid IN (SELECT medcodeid FROM A_hba1_medcodes)\n", + "AND ot.obstypeid = 10 -- value\n", + "AND obs.value IS NOT NULL\n", + "LIMIT 2;" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "3c. 
We can also now count the distinct number of patients:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "--count distinct patients:\n", + "SELECT COUNT(DISTINCT obs.patid) FROM Observation obs\n", + "INNER JOIN obstype ot ON ot.obstypeid = obs.obstypeid\n", + "INNER JOIN medicaldictionary md ON md.medcodeid = obs.medcodeid\n", + "WHERE obs.medcodeid IN (SELECT medcodeid FROM A_hba1_medcodes)\n", + "AND ot.obstypeid = 10 -- value\n", + "AND obs.value IS NOT NULL\n", + "LIMIT 2;\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "source": [ + "------\n", + "\n", + "#### Drop tables created in this notebook:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "sql" + } + }, + "outputs": [], + "source": [ + "%%sql\n", + "DROP TABLE IF EXISTS a_antidiabeticdrug_codelist_name;\n", + "DROP TABLE IF EXISTS a_antidiabeticdrug_codelists;\n", + "DROP TABLE IF EXISTS a_metforminpatients;\n", + "DROP TABLE IF EXISTS a_metforminprodid;\n", + "DROP TABLE IF EXISTS a_type2diabetes_codelists;\n", + "DROP TABLE IF EXISTS a_hba1_medcodes;\n", + "DROP FUNCTION IF EXISTS a_loaddata;" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "py38_default", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.16" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/code-for-aurum/Step2-Notebooks/cprd-aurum-data-structure.png b/code-for-aurum/Step2-Notebooks/cprd-aurum-data-structure.png new file mode 100644 index 0000000..1d8f6ca Binary files /dev/null and 
b/code-for-aurum/Step2-Notebooks/cprd-aurum-data-structure.png differ diff --git a/code-for-aurum/installation-setup.md b/code-for-aurum/installation-setup.md new file mode 100644 index 0000000..6d2db74 --- /dev/null +++ b/code-for-aurum/installation-setup.md @@ -0,0 +1,147 @@ +# Software installation + +Step 1 of the workflow uses **Python** and **PostgreSQL**. To create this workflow, we used a [Data Science Virtual Machine (DSVM)](https://azure.microsoft.com/en-gb/products/virtual-machines/data-science-virtual-machines)(Ubuntu 20.04) which already had Python and PostgreSQL installed. + +- **Python**: the installation of Python will depend on your operating system, reference [the main Python docs](https://www.python.org) for more details. The python files (`.py` below) were created with Python 3.8, using the Python extension within [Visual Studio Code](https://code.visualstudio.com). + +- **PostgreSQL**: On the DSVM, it was likely installed by `sudo apt install postgresql postgresql-contrib`. Reference [the main PostgreSQL docs](https://www.postgresql.org/download) for more details on how to install on your operating system. + +It is likely that your research institution will already have their existing infrastructure and compulsory/recommended software set-ups. 
If not, we include some extra information in the next section, to cover: +- how we configured PostgreSQL for our set-up and created users with different permissions +- how we executed sql commands via the psql terminal, a sql script, and a shell script +- how we used Visual Studio Code to provide a graphical user interface to execute sql commands, as well as our favoured way to connect to the VM via the ssh connection +- how we integrated PostgreSQL with Jupyter Notebook + +# PostgreSQL on Ubuntu DSVM + +## Configure & create users +Resources: [1-Azure Quick Start](https://learn.microsoft.com/en-us/samples/azure/azure-quickstart-templates/postgresql-standalone-server-ubuntu/) and [2-PostgreSQL on DSVM](https://learn.microsoft.com/en-us/azure/machine-learning/data-science-virtual-machine/linux-dsvm-walkthrough?view=azureml-api-2#postgresql-and-squirrel-sql) + +1. Check configuration files for postgresql (resource 2) +> They talk about `/var/lib/pgsql/data/pg_hba.conf` but ours was here: `/etc/postgresql/12/main/pg_hba.conf` +> They say to *'Change the IPv4 local connections line to use md5 instead of ident, so we can log in by using a username and password.'* This was already done for our files, so did not need to change anything. +2. Launch psql, the interactive terminal for PostgreSQL, as the built-in postgres user: + +`sudo su - postgres psql` + +3. As postgres user, check you can create a database, create table, and insert into this table (resource 1) + +4. Create a user account (resource 2), and assign the appropriate [roles](https://www.postgresql.org/docs/current/predefined-roles.html) and [privileges](https://www.postgresql.org/docs/current/ddl-priv.html). + +5. Quit the interactive terminal with `\q` +> It said it could not save history to file `"/var/lib/postgresql/.psql_history": No such file or directory`. It's trying to save the history of what we've done so far in psql as the user 'postgres'. 
Fortunately this does not seem like it really needs immediate solving because when we go on to use psql with our user accounts, it saves the history here `/home/your-username/.psql_history`. + +6. Write `su your_username` to get back to user profile + +7. Log back into the interactive psql terminal by writing `psql` in the terminal and checking that your desired default username is being used + +## Exploring PostgreSQL file layout on the DSVM + +Using [this documentation](https://www.postgresql.org/docs/current/storage-file-layout.html) and cross referencing with our set-up on the DSVM: + +Cluster configuration files are here: +> /etc/postgresql/12/main/ + +The docs talk of a PGDATA directory where data files used by the database cluster are stored. We don't have anything called that but it seems to be here: +> ls /var/lib/postgresql/12/main/ + +We also have postgresql files in these locations: +> /usr/share/postgresql/12/ +> /usr/share/postgresql-common/ + +And, as expected, there are executables in `/bin` which start with `pg_` + +## Test psql commands + +:bulb: A cheat sheet of helpful commands: https://tomcam.github.io/postgres/#using-psql + +### Interactive psql terminal + +When logged in to psql terminal with your username, try these commands: + + - `CREATE DATABASE your_database;` + - `\c your_database` + - `CREATE TABLE products (product integer, name text, price numeric);` + - `INSERT INTO products VALUES (1, 'Cheese',4.99);` + - `SELECT * FROM products;` + - Make a csv file with multiple rows, with these columns `product, name, price` + - `COPY products (product, name, price) FROM 'test_files/products.csv' WITH (FORMAT 'csv', HEADER, DELIMITER ',');` + - `SELECT * FROM products;` + +### Execute from sql script +- Create a new file on the shell terminal `touch test.sql` +- Paste the above commands that you ran on the psql terminal into the test.sql file +- Log into psql `psql` +- Run the sql script, with outputs displayed: `\i test.sql` +- Run the sql script, with 
outputs pasted into a text file: `\o output.txt` `\i test.sql` + +### Execute from shell script +- Create a new file on the shell terminal `touch test_sql.sh` +- Inside this test_sql.sh file write: + +> #!/bin/sh +> +> dbname="database-name-that-exists" +> +> username="your-username" +> +> psql $dbname $username << EOF +> +> SELECT * FROM table-that-exists; +> +> EOF + +All the postgreSQL commands are provided inside the EOF block. You can also use shell variables inside the sql command e.g. + +`wherecond="something"` + +`SELECT * FROM table-that-exists WHERE col_name = '$wherecond';` + +## ssh to the DSVM with VSCode (ignore if no VM being used) + +On our local MacOS we installed *Remote Explorer* and *Remote - ssh* by Microsoft. Instead of using an ssh connection via the terminal app, we can ssh connect to the VM via our local [Visual Studio (VS) Code](https://code.visualstudio.com/) app, with these extensions. + +**After installing extensions:** +- Note a small green box in the bottom left of VSCode, with the '**><**' icon +- Click on **><** to open a remote window +- In the dialog box, select Connect to Host (Remote SSH) > Add new SSH host +- Enter ssh connection details - it needed username, server IP and private key. We located and selected the SSH configuration file previously created. This added the host to our configuration file. +- Select **><** , Connect to Host, and select the server IP address from the drop down. This prompted us to enter a password +- Once done, a new remote window will open. You can confirm this is connected to the host as the >< icon will change in the bottom left + +## Visual Studio Code to interact with PostgreSQL + +As an alternative to using PostgreSQL on the terminal only, we can use an extension to VS Code to give us a graphical user interface (GUI). There are other GUIs available! 
+ +We installed the *PostgreSQL Management Tool* extension by *Chris Kolkman* in two locations: on VSCode on the VM (access via remote desktop) and on VSCode on our local MacOS. + +When in the postgreSQL explorer on VS Code, press the + button to add a database connection. +- Hostname was 'localhost' +- Put in postgreSQL credentials (created earlier) +- Kept the default port +- Chose a secure connection + +## PostgreSQL Integration with Jupyter Notebook + +These python libraries must be installed: ipython-sql, sqlalchemy, psycopg2 + +Then run this in a Python cell: + +``` +%load_ext sql +from sqlalchemy import create_engine +``` + +To connect to a PostgreSQL database, run: + +``` +%sql dialect+driver://username:password@host:port/database +``` + +For example: + +``` +%sql postgresql://postgres:password123@localhost/mydatabase +``` + + diff --git a/code-for-aurum/workflow_idea.png b/code-for-aurum/workflow_idea.png new file mode 100644 index 0000000..70180d4 Binary files /dev/null and b/code-for-aurum/workflow_idea.png differ diff --git a/cprd-code-browser.md b/cprd-code-browser.md new file mode 100644 index 0000000..2c95a71 --- /dev/null +++ b/cprd-code-browser.md @@ -0,0 +1,13 @@ +## CPRD's Code Browser + +👉 Refer to [the CPRD website](https://cprd.com/defining-your-study-population) for information on CPRD's Code Browser tool, particularly the *'How to find the codes used in primary care data'* section, which links to a user guide. + +👉 A researcher can request access to the Code Browser tool for free by emailing *enquiries@cprd.com*. + +The Code Browser software is designed for Windows OS only. + +## Why use the Code Browser? + +- It contains additional information to the data specification PDFs as it gives you access to the medical and product dictionaries, allowing you to browse and create code lists that can be used to build your cohort. +- It contains some observation counts (for real data, not synthetic). 
These counts are rounded to one significant figure for disclosure control. However, the observation counts only show the number of times each code has been used in the Observation file, and not the number of patients who have that code in their record (one patient may have the same code multiple times during different consultations). +- For a patient count for a specific condition, CPRD first recommends looking at publications in your area of interest in their bibliography, otherwise you can request a feasible count from CPRD based on inclusion/exclusion criteria.