diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
new file mode 100644
index 0000000..bc1ff7f
--- /dev/null
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1 @@
+# TODO QQQQ
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..63b7205
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,56 @@
+# We want CI processes that can run independently of Databricks, enforced as branch rules, so that we don't
+# deploy (at cost) code that we already know needs changing --
+# linting and Python unit tests, and maybe a DAB verify step.
+# We run these on all pull requests because a hotfix may not have passed through
+# staging, for example.
+# qqqq check this is up to date
name: CI - Pull Request Checks
+
+# Run CI on all pull requests
+on:
+  pull_request:
+    branches:
+      - '**' # all branches
+
+jobs:
+  ci_checks:
+    name: "Linting, Unit Tests, DAB Verify"
+    runs-on: ubuntu-latest
+
+    steps:
+      # Checkout code
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      # Set up Python
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+
+      # Install dependencies used for linting and unit tests
+      - name: Install dependencies
+        run: pip install -r requirements-dev.txt
+
+      # Run Python unit tests
+      - name: Run Unit Tests
+        run: pytest tests/unit
+
+      # Run Python lint
+      # qqqq one example used flake8 instead
+      # pyproject.toml will need configuring
+      - name: Run Linting
+        run: pylint src
+
+      # qqqq TODO: add a commit lint step and its config
+      # see TELBlazor
+      - name: Commit lint
+        run: |
+          echo "Commit lint not implemented"
+          exit 1
+
+      # qqqq TODO: add a version generation step and its config
+      # see TELBlazor
+      - name: Version Generation Test Run
+        run: |
+          echo "Version test run not implemented"
+          exit 1
diff --git a/.github/workflows/dev.yml b/.github/workflows/dev.yml
new file mode 100644
index 0000000..6a66a04
--- /dev/null
+++ b/.github/workflows/dev.yml
@@ -0,0 +1,16 @@
+# qqqq move notes somewhere useful
+# qqqq I don't think we need a dev pipeline event, because people work in their own dev workspace on Databricks or on a dev branch
+# - but let's give it a go as a place for data devs to update their shared work together; we may not need it
+# - do we repeat any testing here? We want to know early anyway
+# - don't worry about repeating steps between CI/CD for now; keep it simple for branch rules etc. and revisit later if desired
+# When are we deploying local DABs, though? On push to feature branches?
+# -- Linking it to push might be nice: deploying the DAB is an easily forgotten step in local development, so auto-deploying on push could help
+name: Deploy to Databricks Dev Shared
+
+on:
+  push:
+    branches:
+      - dev-data-team-shared
+
+## Test, deploy the DAB, then tell the dev repo folder on Databricks to pull, so it stays in sync with its DAB (or should this workflow deploy the DAB itself? Apparently Git folders are for reference only)
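+## qqqq a possible shape for this job, sketched here as an assumption (not agreed or tested):
+# jobs:
+#   deploy_dev_shared:
+#     name: "Deploy Bundle to Shared Dev"
+#     runs-on: ubuntu-latest
+#     steps:
+#       - uses: actions/checkout@v4
+#       - uses: databricks/setup-cli@main
+#       - name: Deploy bundle
+#         run: databricks bundle deploy -t dev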
+jobs:
diff --git a/.github/workflows/prod-cd.yml b/.github/workflows/prod-cd.yml
new file mode 100644
index 0000000..b825426
--- /dev/null
+++ b/.github/workflows/prod-cd.yml
@@ -0,0 +1,28 @@
+# No code quality checks here; staging has already run them
+
+name: Deploy to Production Databricks Workspace
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  deploy_prod:
+    name: "Deploy Bundle to Production Environment"
+    runs-on: ubuntu-latest
+    # needs: testing # qqqq there is no testing job in this workflow; re-enable if one is added
+    environment: prod
+    env:
+      DATABRICKS_HOST: ${{ vars.DBX_HOST }}
+      DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }}
+      DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }}
+
+    steps:
+      # qqqq add a version and changelog creation step, and give the DAB a version matching the repo version
+      - uses: actions/checkout@v4
+      - uses: databricks/setup-cli@main
+      - name: Deploy bundle
+        run: databricks bundle deploy -t prod --auto-approve
+        working-directory: .
\ No newline at end of file
diff --git a/.github/workflows/staging-cicd.yml b/.github/workflows/staging-cicd.yml
new file mode 100644
index 0000000..4848a3a
--- /dev/null
+++ b/.github/workflows/staging-cicd.yml
@@ -0,0 +1,74 @@
+# Rely on ci.yml and branch rules to ensure bundle validation and linting
+# We are not going straight from staging to prod, because we use staging for manual testing as well
+# [use as a ref](https://github.com/evanaze/dbx-asset-bundle-deployment/blob/main/.github/workflows/dev-ci.yml)
name: Deploy to Databricks Staging and Trigger Tests in Databricks
+
+on:
+  push:
+    branches:
+      - staging # longer names kept for communication in the POC; shorten when actually in use
+      - staging-data-test-team-shared
+
+jobs:
+  deploy_to_staging:
+    name: "Deploy Bundle to Stage Environment"
+    runs-on: ubuntu-latest
+    environment: staging
+    env:
+      DATABRICKS_HOST: ${{ vars.DBX_HOST }}
+      DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }}
+      DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      ### maybe put in later
+      # - name: Set up Python
+      #   uses: actions/setup-python@v4
+      #   with:
+      #     python-version: '3.10'
+
+      # - name: Install dependencies
+      #   run: |
+      #     pip install -r requirements.txt
+      # qqqq add a step to create a version number, so we can see it has changed, and print it as part of the job
+      # DAB_VERSION: "staging-${GITHUB_SHA::7}" # short commit hash
+      # databricks bundle deploy --environment staging --version $DAB_VERSION
+
+      - name: Set up Databricks CLI
+        uses: databricks/setup-cli@main
+
+      - name: Validate Staging Bundle
+        # target the staging deployment defined in databricks.yml
+        run: databricks bundle validate -t staging
+
+      - name: Deploy Staging Bundle
+        # target the staging deployment defined in databricks.yml
+        run: databricks bundle deploy -t staging # qqqq --auto-approve?
+        # working-directory: .
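+      # qqqq a sketch for the version-number note above (an assumption, not tested):
+      # one step derives a short-SHA version and exports it via $GITHUB_ENV for later steps
+      # - name: Compute DAB version
+      #   run: echo "DAB_VERSION=staging-${GITHUB_SHA::7}" >> "$GITHUB_ENV"
+      # - name: Print DAB version
+      #   run: echo "Deployed DAB version $DAB_VERSION"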
+
+  # testing:
+  #   name: "Integration Testing"
+  #   runs-on: ubuntu-latest
+  #   needs: deploy_to_staging
+  #   environment: staging
+  #   env:
+  #     DATABRICKS_HOST: ${{ vars.DBX_HOST }}
+  #     DATABRICKS_CLIENT_ID: ${{ vars.DBX_SP_ID }}
+  #     DATABRICKS_CLIENT_SECRET: ${{ secrets.DBX_SP_SECRET }}
+
+  #   steps:
+  #     - uses: actions/checkout@v4
+  #     - uses: actions/setup-python@v5
+  #       id: cache
+  #       name: Cache Python build
+  #       with:
+  #         python-version: "3.x"
+  #         cache-dependency-path: "requirements-dev.txt"
+  #         cache: "pip"
+  #     - name: Install dependencies
+  #       run: pip install -r requirements-dev.txt
+  #     - name: Integration Testing
+  #       run: pytest tests/integration
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..840b1da
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,47 @@
+# Couldn't find an official gitignore; this is AI generated
+# NB: gitignore has no trailing comments (text after a pattern becomes part of the pattern), so comments get their own lines
+# -----------------------------
+# Databricks / DAB / dbx
+# -----------------------------
+# local workspace metadata / CLI files
+.databricks/
+# local deploy cache (dbx/DAB)
+.deploy/
+# local bundle files (dbx/DAB)
+.bundle/
+# temporary logs
+*.log
+# temporary files
+*.tmp
+# backup of bundle config
+dbx_project.yaml.bak
+build/
+dist/
+
+# -----------------------------
+# Python
+# -----------------------------
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.egg-info/
+.venv/
+env/
+pip-selfcheck.json
+
+# -----------------------------
+# Jupyter Notebooks
+# -----------------------------
+.ipynb_checkpoints/
+
+# -----------------------------
+# Scratch / experimental folder
+# -----------------------------
+# ignore all files in scratch, except the placeholder README.md
+scratch/**
+!scratch/README.md
+
+# -----------------------------
+# IDE / editor
+# -----------------------------
+.vscode/
+.idea/
+
+# -----------------------------
+# OS / system
+# -----------------------------
+.DS_Store
+Thumbs.db
\ No newline at end of file
diff --git a/README.md b/README.md
index b9915b3..3f2f561 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,169 @@
 # DatabricksPOC
-Experimenting with databricks workflow
+Experimenting with Databricks workflows, specifically DABs
+
+# Notable deviations from how the final version will work
+- Databricks instance per environment
+  - target the host instance, not catalogs
+  - root does not need to be specified
+  - yml hardcoding will be replaced with variables
+
+# Branching
+- Dev is like local, so maybe PRs and deployment to your own space should not be gated
+  - so lint only
+  - auto merge
+  - unit tests
+  - no PRs
+- staging
+  - require PR
+  - run tests after merge
+- main/prod
+  - rerun staging tests
+  - integration tests
+  - maybe some review, but unlikely unless very motivated to check for DRY early on
+- dev is per user and there is a preference for working within Databricks, so staging and main only seems the best approach. Versioning can be done via a commit tracker on pushes to main
+
+- why are folders named TD-xxx?
+  - if it's WIP, should it live in scratch,
+    or should scratch just be gitignored?
+- why not on their own branches?
+- we need a README of the structure, with what is used for what
+
+
+# Refs
+- [Official, up to step 5](https://docs.databricks.com/aws/en/ldp/convert-to-dab)
+- [follow this too](https://www.evanazevedo.com/blog/databricks-deployment/)
+  - in this, staging auto-promotes to prod, which we don't want
+  - dev is a shared workspace rather than per-user space, which we don't want
+  - in our approach, local development deploys via the client in the UI for dev, but staging and prod are deployed by GitHub Actions instead
+- also of use: [multiple projects](https://github.com/datakickstart/datakickstart_dabs/tree/main)
+- [another with loads of examples to drill down into](https://github.com/databricks/bundle-examples)
+
+# Potential Structure
+*Should generate one afterwards*
+[Confluence structure to compare to mine](https://hee-tis.atlassian.net/wiki/spaces/TP/pages/5201494023/GitHub+Structure)
+project-root/
+│
+├── README.md
+├── databricks.yml          # Asset bundle config
+├── notebooks/              # For exploratory work & polished pipelines
+│   ├── dev/                # Analysts' playground
+│   ├── pipelines/          # Production-ready notebooks
+│   └── utils/              # Shared utility notebooks
+│
+├── src/                    # Core functions and transformations
+│   ├── bronze/
+│   ├── silver/
+│   ├── gold/
+│   └── common/             # Reusable code (UDFs, helpers)
+│
+├── tests/
+│   ├── unit/
+│   ├── integration/
+│   └── data_quality/
+│
+├── configs/
+│   ├── dev.yml
+│   ├── staging.yml
+│   └── prod.yml
+│
+├── pipelines/              # Declarative pipeline definitions
+│   ├── bronze_pipeline.py
+│   ├── silver_pipeline.py
+│   └── gold_pipeline.py
+│
+├── requirements.txt        # Python dependencies
+├── environment.yml         # Conda environment for analysts
+└── scripts/                # Utility scripts (deploy, tests)
+
+# Notes on Structure
+
+| 1st Level        | 2nd Level          | Notes |
+|------------------|--------------------|-------|
+| README.md        | —                  |       |
+| databricks.yml   | —                  |       |
+| notebooks        | dev                |       |
+| notebooks        | pipelines          |       |
+| notebooks        | utils              |       |
+| src              | bronze             |       |
+| src              | silver             |       |
+| src              | gold               |       |
+| src              | common             |       |
+| tests            | unit               |       |
+| tests            | integration        |       |
+| tests            | data_quality       |       |
+| configs          | dev.yml            |       |
+| configs          | staging.yml        |       |
+| configs          | prod.yml           |       |
+| pipelines        | bronze_pipeline.py |       |
+| pipelines        | silver_pipeline.py |       |
+| pipelines        | gold_pipeline.py   |       |
+| requirements.txt | —                  |       |
+| environment.yml  | —                  |       |
+| scripts          | —                  |       |
+
+
+# Template README
+
+# Workflow_POC
+
+The 'Workflow_POC' project was generated using the default template.
+
+* `src/`: Python source code for this project.
+* `resources/`: Resource configurations (jobs, pipelines, etc.)
+* `tests/`: Unit tests for the shared Python code.
+* `fixtures/`: Fixtures for data sets (primarily used for testing).
+
+
+## Getting started
+
+Choose how you want to work on this project:
+
+(a) Directly in your Databricks workspace, see
+    https://docs.databricks.com/dev-tools/bundles/workspace.
+
+(b) Locally with an IDE like Cursor or VS Code, see
+    https://docs.databricks.com/dev-tools/vscode-ext.html.
+
+(c) With command line tools, see https://docs.databricks.com/dev-tools/cli/databricks-cli.html
+
+If you're developing with an IDE, dependencies for this project should be installed using uv:
+
+* Make sure you have the uv package manager installed.
+  It's an alternative to tools like pip: https://docs.astral.sh/uv/getting-started/installation/.
+* Run `uv sync --dev` to install the project's dependencies.
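+The `fixtures/` folder is consumed through a `load_fixture` helper in tests (see `fixtures/.gitkeep` later in this diff). The template does not show its implementation, so purely as a sketch -- the file layout and JSON-only handling here are assumptions -- a `conftest.py` providing it might look like:
+
+```python
+# conftest.py -- hypothetical sketch of the load_fixture fixture used in tests
+import json
+from pathlib import Path
+
+import pytest
+
+FIXTURES_DIR = Path(__file__).parent / "fixtures"
+
+
+@pytest.fixture
+def load_fixture():
+    """Return a loader that reads a JSON fixture file from fixtures/."""
+    def _load(filename: str):
+        with (FIXTURES_DIR / filename).open() as f:
+            return json.load(f)
+    return _load
+```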
+
+
+# Using this project from the CLI
+
+The Databricks workspace and IDE extensions provide a graphical interface for working
+with this project. It's also possible to interact with it directly using the CLI:
+
+1. Authenticate to your Databricks workspace, if you have not done so already:
+    ```
+    $ databricks configure
+    ```
+
+2. To deploy a development copy of this project, type:
+    ```
+    $ databricks bundle deploy --target dev
+    ```
+    (Note that "dev" is the default target, so the `--target` parameter
+    is optional here.)
+
+    This deploys everything that's defined for this project.
+
+3. Similarly, to deploy a production copy, type:
+    ```
+    $ databricks bundle deploy --target prod
+    ```
+
+4. To run a job or pipeline, use the "run" command:
+    ```
+    $ databricks bundle run
+    ```
+
+5. Finally, to run tests locally, use `pytest`:
+    ```
+    $ uv run pytest
+    ```
+
diff --git a/databricks.yml b/databricks.yml
new file mode 100644
index 0000000..b9a2ce5
--- /dev/null
+++ b/databricks.yml
@@ -0,0 +1,173 @@
+
+########################################
+#### POC NOTES DELETE LATER ############
+########################################
+## Would have preferred to use .databrickscfg to hold host and SP names as custom values;
+## it seems it can set profiles for built-in values but not custom ones
+## # qqqq secrets: C:\Users\NM2.W9215KB2\.databrickscfg will be needed for git too
+## # it seems it doesn't support the custom ones we want, e.g.:
+## # [staging]
+## # staging_host = -> in databricks.yml host: ${workspace.staging_host}
+## # staging_env_sp_name =
+## # staging_storage_name =
+#######################################
+#### Differences expected in the final version
+######################################
+## storage account per environment
+## different host for each Databricks workspace
+## will load wheels via artifacts when we have them
+## permission groups needed for admin of staging, prod etc.
+#######################################
+#### .databrickscfg example with custom values didn't work; we will only set the token in that file, I think
+#########################################
+# [dev]
+# host = https://adb-295718430158257.17.azuredatabricks.net
+# token = dapi************************
+# dev_env_sp_name = b0326488-f6b0-4c0f-bf20-a091224bad83
+# dev_storage_name = unifiedrptdeltalake
+# test_var = foundatestvar
+########################################
+## Catalogs need to be unique because of the Databricks metastore
+## We will set the catalogs to be accessible only from their corresponding workspace;
+## that workspace will be linked to its own storage
+## dev_catalog will have unique schema names per user, prefixed with their username, so there is separation for experimentation
+## staging_catalog and prod_catalog will have shared schema names
+## schema names will not be defined here; a recommended pattern is the layer name (bronze, silver ... transformations) followed by the "domain" (may not be the right word), e.g. ods, giving bronze_ods
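+## To make that naming concrete, a pipeline script could assemble the schema name from these parts
+## (a sketch only -- the <prefix><layer>_<domain> convention below is assumed, not agreed):
+##   schema_prefix = spark.conf.get("bundle.schema_prefix")  # "" in staging/prod, e.g. "phil_" in dev
+##   schema_name = f"{schema_prefix}bronze_ods"              # <prefix><layer>_<domain>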
+## personal DABs should use personal auth, but allow the dev_user group to view and dev_env_sp to manage them
+## prod/staging DAB permissions should be applied via GitHub Actions on pull request, using a service principal
+## in future we should review permissions on staging and prod: they should be least-privilege and handled by service principals unless there is an issue
+## we should have no username-based permissions
+#######################################
+## ref [Evanaze example repo](https://github.com/evanaze/dbx-asset-bundle-deployment)
+########################################
+#### END POC NOTES DELETE LATER ##########
+########################################
+
+######################################
+## Databricks Asset Bundle Definition
+######################################
+## ...
+
+######################################
+
+bundle:
+  name: DatabricksPOC
+  uuid: ba682f8a-5d14-4ae7-a770-d6a359c0e835
+
+# Importing resources. These are split into their own files for modularity
+# qqqq come back: may be a key place for DRY
+include:
+  - resources/*.yml
+  - resources/*/*.yml
+
+# Variable declarations. These variables are assigned in the dev/prod targets below,
+# then made available to Python scripts via spark.conf.get("bundle.<name>") once exposed by the yml running that file
+variables:
+  catalog:
+    description: Environment-specific catalog name
+  # schema:
+  #   default: qqqq-deleted-later
+  #   description: Actually setting this with vars but included due to an error
+  schema_prefix:
+    default: "" # used in dev only, so staging/prod have no prefix value
+    description: Prefixes dev users' schema names so multiple dev users can use the same catalog with their own distinct schema names for development
+  env_name:
+    description: The environment name (dev, staging, prod)
+  storage_account:
+    description: Each Databricks workspace's dedicated storage (dev, staging, prod)
+  # The service principal's id is used for the name field in databricks.yml; the SP variables are named <env>_env_sp_id
+  # It would be nice to hide these in the cfg; GitHub has them in vars anyway. Not seen this done -- tried the cfg file, haven't tried a config.yml. [example proj with {{ dev_sp }}](https://github.com/evanaze/dbx-asset-bundle-deployment/blob/main/databricks.yml) ... not seeing databricks bundle deploy -t staging --var dev_sp= being specified in the CI/CD
+  dev_env_sp_id:
+    default: "b0326488-f6b0-4c0f-bf20-a091224bad83"
+  staging_env_sp_id:
+    default: "d588f2c8-0c0a-4ded-9da2-0663bf8dd994"
+  prod_env_sp_id:
+    default: "my-sp-id-jfsdkjhfjsdhfkjh"
+
+# qqqq will want this later if there are many python files
+# artifacts:
+#   python_artifact:
+#     type: whl
+#     build: uv build --wheel
+
+# Deployment environments
+targets:
+  dev:
+    # qqqq what is the process for developing and deploying DABs to your own area?
+    # qqqq would it actually be staging, with features against folders?
+
+    mode: development
+    # Deploy under your own user, not a service principal
+    default: true
+    # This is the default deployment workspace
+    workspace:
+      # profile means it gets its values from the [dev] section of .databrickscfg
+      profile: dev
+      # specify hosts in databricks.yml, not the config, for clarity
+      # dev databricks host
+      host: https://adb-295718430158257.17.azuredatabricks.net
+      # Dev root path under each user's home directory
+      root_path: /Users/${workspace.current_user.userName}/.bundle/${bundle.name}/dev
+    variables:
+      env_name: dev
+      catalog: dev_catalog
+      # only dev has a schema prefix, as dev is per user
+      # e.g. phil_ will become phil_bronze_ods if we adopt layer and domain
as our schema naming convention + schema_prefix: ${workspace.current_user.short_name}_ + # dev storage account + storage_account: unifiedrptdeltalake + permissions: + - level: CAN_MANAGE + service_principal_name: "${var.dev_env_sp_id}" + - level: CAN_MANAGE + # Devs manage our own stuff + user_name: ${workspace.current_user.userName} + - level: CAN_VIEW + # Devs can see each others stuff + group_name: dev_env_users + + + + staging: + # Staging should purely be for investigation and testing prior to going to prod it runs in production mode so will run and constant updates and should be through github actions and service principle on successful pull request + mode: production + workspace: + profile: staging + # staging databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + root_path: /Workspace/.bundle/${bundle.name}/staging + variables: + env_name: staging + catalog: staging_catalog + # Staging storage account + storage_account: unifiedrptdeltalake + permissions: + - user_name: philip.tate@nhs.net + level: CAN_MANAGE + - group_name: staging_env_users + level: CAN_VIEW + - service_principal_name: "${var.staging_env_sp_id}" + level: CAN_MANAGE + + + prod: + # Automatically deployed to via git actions and service principle minimal testing on deploy as previously run on staging + mode: production + workspace: + profile: prod + # prod databricks host + host: https://adb-295718430158257.17.azuredatabricks.net + root_path: /Workspace/.bundle/${bundle.name}/prod + variables: + # when 3 databricks it will be a share catalog name across the databricks + env_name: prod + catalog: prod_catalog + # Prod storage account + storage_account: unifiedrptdeltalake + permissions: + - user_name: philip.tate@nhs.net + level: CAN_MANAGE + - group_name: prod_env_users + level: CAN_VIEW + - service_principal_name: "${var.prod_env_sp_id}" + level: CAN_MANAGE diff --git a/devops/user dev dab deploy.ipynb b/devops/user dev dab deploy.ipynb new file mode 100644 index 0000000..f9229c8 --- /dev/null +++ b/devops/user dev dab deploy.ipynb @@ -0,0 +1,57 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "dd35ddc1-acdf-428d-8fd1-0abaf308ceb0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "import subprocess\n", + "\n", + "\n", + "print(\"\\nStep 2: Validate DAB\")\n", + "subprocess.run([\"databricks\", \"bundle\", \"validate\"])\n" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "mostRecentlyExecutedCommandWithImplicitDF": { + "commandId": -1, + "dataframes": [ + "_sqldf" + ] + }, + "pythonIndentUnit": 4 + }, + "notebookName": "user dev dab deploy", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..e69de29 diff --git a/docs/deployment_guide.md b/docs/deployment_guide.md new file mode 100644 index 0000000..e69de29 diff --git a/environment-redundant use toml.yml b/environment-redundant use toml.yml new file mode 100644 index 0000000..e69de29 diff --git a/fixtures/.gitkeep 
b/fixtures/.gitkeep new file mode 100644 index 0000000..6cabba3 --- /dev/null +++ b/fixtures/.gitkeep @@ -0,0 +1,9 @@ +# Test fixtures directory + +Add JSON or CSV files here. In tests, use them with `load_fixture()`: + +``` +def test_using_fixture(load_fixture): + data = load_fixture("my_data.json") + assert len(data) >= 1 +``` diff --git a/notebooks/.gitinclude b/notebooks/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..1707b8a --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,32 @@ +[project] +name = "Workflow_POC" +version = "0.0.1" +authors = [{ name = "philip.tate@nhs.net" }] +requires-python = ">=3.10,<=3.13" +dependencies = [ + # Any dependencies for jobs and pipelines in this project can be added here + # See also https://docs.databricks.com/dev-tools/bundles/library-dependencies + # + # LIMITATION: for pipelines, dependencies are cached during development; + # add dependencies to the 'environment' section of your pipeline.yml file instead +] + +[dependency-groups] +dev = [ + "pytest", + "databricks-dlt", + "databricks-connect>=15.4,<15.5", +] + +[project.scripts] +main = "Workflow_POC.main:main" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[tool.hatch.build.targets.wheel] +packages = ["src"] + +[tool.black] +line-length = 125 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..e69de29 diff --git a/resources/configs/.gitinclude b/resources/configs/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/resources/configs/dev.yml b/resources/configs/dev.yml new file mode 100644 index 0000000..fd3aaa4 --- /dev/null +++ b/resources/configs/dev.yml @@ -0,0 +1,13 @@ +# TODO and gitignore it and only need the one relevant to my databricks workspace +# something like + +# vars: + # # Unity Catalog + # catalog: dev_catalog # temporary dev catalog name + # schema: unified_reporting_dev # schema for bronze/silver/gold + + # # Root path for data (can be DBFS or ADLS mount) + # data_root: /mnt/dev/unified_reporting + + # # Mode flag for your code + # mode: development \ No newline at end of file diff --git a/resources/configs/prod.yml b/resources/configs/prod.yml new file mode 100644 index 0000000..e69de29 diff --git a/resources/configs/staging.yml b/resources/configs/staging.yml new file mode 100644 index 0000000..e69de29 diff --git a/resources/jobs/create_db_connections.dontneedityml b/resources/jobs/create_db_connections.dontneedityml new file mode 100644 index 0000000..8832c4e --- /dev/null +++ b/resources/jobs/create_db_connections.dontneedityml @@ -0,0 +1,14 @@ +resources: + jobs: + CreateDatabaseConnections_job: + name: CreateDatabaseConnections_job + description: "Sets up database connections and foreign catalogs required for pipelines" + email_notifications: + on_failure: + - philip.tate@nhs.net + tasks: + - task_key: create_db_connections + notebook_task: + notebook_path: ingestion/CreateDatabaseConnections.ipynb + base_parameters: + environment: ${var.env_name} diff --git a/resources/pipeline/.gitinclude b/resources/pipeline/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/resources/pipeline/ods_ingestion.yml b/resources/pipeline/ods_ingestion.yml new file mode 100644 index 0000000..2685775 --- /dev/null +++ b/resources/pipeline/ods_ingestion.yml @@ -0,0 +1,86 @@ +############################### +## POC notes - DELETE LATER +############################### +## We should think about these resource 
files; a .yml per layer (e.g. bronze.yml) may potentially make sense
+## We will not define schemas here
+## We use this file to expose, from databricks.yml, the variables we need to set up the pipeline
+## We also define variables scoped just to this set of pipelines here; if we start running layer-based .ymls then layer-level variables would live here
+###############################
+## If we want a specific pipeline resource file per .py file, we should use this, I think:
+  # libraries:
+  #   - notebook:
+  #       path: ../../src/ingestion/ods_ingest.py
+## if we want per layer, maybe:
+  # libraries:
+  #   - glob:
+  #       # if doing a pipeline per layer we would do something like
+  #       include: ../../src/ingestion/**.py
+## if we want per domain, maybe:
+  # libraries:
+  #   - glob:
+  #       include: ../../src/ingestion/ods_*.py
+###############################
+
+# qqqq discuss where we want these things to live; if we were using a wheel, the python file could literally be a table list and a foreach
+#####
+# If we are running multiple pipelines we may define all their vars at the top
+#####
+
+# qqqq naming the config keys:
+## I was thinking "var." for in-script vars <-- but no, because I can't get bundle.xyz, and not all vars seem accessible everywhere; I get catalog from databricks.yml
+## "bundle." for vars originating from databricks.yml
+## "pipeline." for vars from pipeline files
+## but when the files run it shouldn't be "bundle" and "pipeline"; the prefixes should represent the scope the values come from
+
+## qqqq I like passing the top-level config value; I do not like constructing vars in a yml instead of python, but:
+# Error: cannot create pipeline: The target schema field is required for UC pipelines. Reason: DLT requires specifying a target schema for UC pipelines. Please use the TEMPORARY keyword in the CREATE MATERIALIZED VIEW or CREATE STREAMING TABLE statement if you do not wish to publish your dataset..
+# Error: cannot update pipeline: Specified 'schema' field in the pipeline settings is illegal. Reason: Cannot unset 'schema' field once it's defined in the pipeline spec. Please create a new DLT pipeline. For more information about publishing modes, see https://docs.databricks.com/en/dlt/migrate-to-dpm.html.
+variables:
+  layer:
+    default: bronze
+    description: bronze, silver, transformations etc.
+
+
+x-bronze-config: &bronze-config
+  bundle.env_name: ${var.env_name}
+  bundle.storage_account: ${var.storage_account} # storage is environment-specific, so defined in databricks.yml
+  pipeline.layer: ${var.layer} # if we are doing layer-based resource files qqqq get from var
+  # f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/" -> py adds "{folder_name}/"
+  pipeline.storage_container_path: "abfss://${var.layer}@${var.storage_account}.dfs.core.windows.net/"
+
+resources:
+  pipelines:
+    pipeline_ods_ingestion:
+      name: ods_ingestion
+      libraries:
+        - glob:
+            # if doing a pipeline per layer we would do something like:
+            # include: ../../src/ingestion/ - might work
+            # include: ../../src/ingestion/*.py - doesn't work
+            include: ../../src/ingestion/ods_ingest.py
+      photon: true
+      # qqqq is it good practice to specify this? It's something to do with DLT having a beta version
+ channel: current + # By defining catalog here we set it for all jobs in the pipeline without needing to specify it witht he variable when defining a table + catalog: ${var.catalog} + target: ${var.schema_prefix}${var.layer}_ods ## AI said missing this qqqq i dont want this hard coded here + serverless: true + # qqqq dont think i need this here DELETE root_path: ../../src/ingestion + # qqqq config is only at pipeline level use yml anchor points if need to reuse + configuration: + ################ Map Databricks Bundle variables to Spark Config Properties ################ + # Map the Bundle variables (from databricks.yml) to Spark config properties + # The key names here MUST match what you use in spark.conf.get() in Python! + # bundle.env_name: ${var.env_name} + # bundle.schema_prefix: ${var.schema_prefix} - qqqq setting schema now in the yml + # bundle.storage_account: ${var.storage_account} + ############### Resource yml files for set of pipelines ################# + # If we do bronze, silver ... tranformation based layers with own yml files will define layer level vars here + # for example this would be + # bundle.layer_name: bronze -> #schema_layer = "bronze_" -> # schema_layer = park.conf.get("bundle.layer_name") + # configuration: + <<: *bronze-config #config anchor point for bronze layer so all pipelines in this file will have this set of configs + pipeline.domain: ods # if we then want to apply per pipeline variable here \ No newline at end of file diff --git a/resources/setup/unity-catalog-try-later.yml b/resources/setup/unity-catalog-try-later.yml new file mode 100644 index 0000000..5737fd5 --- /dev/null +++ b/resources/setup/unity-catalog-try-later.yml @@ -0,0 +1,46 @@ + +# rather than recording manual run scripts in the the setup top level folder we could check and run like this potentially +# this quick was just a suggested improvement and has not been checked just put here for later +# qqqq + + +# # resources/unity-catalog.yml +# resources: +# # --- 1. CATALOGS --- +# unity_catalog_catalogs: +# # Defining the catalog for the DEV environment +# our_catalog: +# name: ${var.catalog} # Will resolve to 'our_catalog' in dev target +# # You can optionally define the managed location here if you want to hardcode it, +# # but often this is best managed by the workspace admin. +# # managed_location: 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/our_catalog' +# comment: 'Dev environment - personal schemas per developer' + +# # The staging and prod targets will use their respective catalog names (staging_catalog/prod_catalog) +# # They can reference the same general structure here, or be defined separately if they have different configs. +# staging_catalog: +# name: staging_catalog +# comment: 'Staging environment - integration testing' + +# prod_catalog: +# name: prod_catalog +# comment: 'Production environment - live data' + +# # --- 2. SCHEMAS (for Staging and Prod) --- +# unity_catalog_schemas: +# staging_schema: +# # Use explicit names for Staging/Prod as they are shared/static +# schema: staging_catalog.our_schema +# catalog: staging_catalog +# comment: 'Shared schema for Staging data' + +# prod_schema: +# schema: prod_catalog.our_schema +# catalog: prod_catalog +# comment: 'Shared schema for Production data' + +# # NOTE on DEV Schema: +# # You DO NOT need to explicitly define the DEV schema here (e.g., our_catalog.${workspace.current_user.short_name}). 
+# # DLT pipelines, when configured with `schema: ${var.schema}` and `catalog: ${var.catalog}`, +# # will automatically create the *schema* (database) if it doesn't exist, as long as the +# # user/service principal has the necessary CREATE SCHEMA permission on the target catalog. \ No newline at end of file diff --git a/scripts/.gitinclude b/scripts/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/setup/Readme.md b/setup/Readme.md new file mode 100644 index 0000000..b62226e --- /dev/null +++ b/setup/Readme.md @@ -0,0 +1 @@ +Manually run files, but kept here for a history or for future setup \ No newline at end of file diff --git a/setup/setup_catalogs.sql b/setup/setup_catalogs.sql new file mode 100644 index 0000000..d44795d --- /dev/null +++ b/setup/setup_catalogs.sql @@ -0,0 +1,40 @@ +-- ============================================================================ +-- ONE-TIME SETUP: Create catalogs and schemas +-- ============================================================================ +-- Run this in Databricks SQL Editor when setting up a new environment +-- +-- POC Setup (current): +-- - Creates 3 separate catalogs in single Databricks instance +-- - our_catalog: Dev work with personal schemas per developer +-- - staging_catalog: Staging with shared schema +-- - prod_catalog: Production with shared schema +-- +-- Production Setup (future with 3 Databricks instances): +-- - Create same catalog name on each instance +-- - Separation by different hosts, not catalog names +-- ============================================================================ + + +-- qqqq Careful this applied it to all databricks +-- there is this +-- ISOLATION MODE ISOLATED -- this line is the magic + -- COMMENT 'POC-only – never visible outside this workspace'; + +-- Create catalogs using managed storage (Databricks handles storage automatically) +CREATE CATALOG IF NOT EXISTS our_catalog +MANAGED LOCATION 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/our_catalog' +COMMENT 'Dev environment - personal schemas per developer'; + +CREATE CATALOG IF NOT EXISTS staging_catalog +MANAGED LOCATION 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/staging_catalog' +COMMENT 'Staging environment - integration testing'; + +CREATE CATALOG IF NOT EXISTS prod_catalog +MANAGED LOCATION 'abfss://unity-catalog-storage@dbstoragenxhpv6mlq64wq.dfs.core.windows.net/295718430158257/prod_catalog' +COMMENT 'Production environment - live data'; + +-- Create shared schemas for staging and prod +-- Dev schemas are created automatically per-user by the bundle deployment +-- (via schema: ${workspace.current_user.short_name} in databricks.yml) +CREATE SCHEMA IF NOT EXISTS staging_catalog.our_schema; +CREATE SCHEMA IF NOT EXISTS prod_catalog.our_schema; \ No newline at end of file diff --git a/src/.gitinclude b/src/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/bronze/- Copy.gitinclude b/src/bronze/- Copy.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/dlt_pipeline.py b/src/dlt_pipeline.py new file mode 100644 index 0000000..93802c1 --- /dev/null +++ b/src/dlt_pipeline.py @@ -0,0 +1 @@ +# entry point DLT pipelines \ No newline at end of file diff --git a/src/gold/- Copy.gitinclude b/src/gold/- Copy.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/ingestion/CreateDatabaseConnections.dontneedipynb b/src/ingestion/CreateDatabaseConnections.dontneedipynb new file mode 
100644 index 0000000..d9142a7 --- /dev/null +++ b/src/ingestion/CreateDatabaseConnections.dontneedipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a98c9296-e636-4ff7-a2ea-3e49feacf9c2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "moodlelearninghub" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "d76cd8c9-2608-4449-b0d0-3450fcc2910b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sql\n", + "set scope = {{:scope}}\n", + "\n", + "-- CREATE CONNECTION IF NOT EXISTS moodlelearninghub\n", + "CREATE CONNECTION IF NOT EXISTS moodlelearninghub\n", + " TYPE SQLSERVER\n", + " OPTIONS (\n", + " host secret(scope, 'LearningHubMoodle.host'),\n", + " port secret(scope, 'LearningHubMoodle.port'),\n", + " user secret(scope, 'LearningHubMoodle.username'),\n", + " password secret(scope, 'LearningHubMoodle.password')\n", + " );\n", + "\n", + "-- CREATE CATALOG moodle_learning_hub_catalogue\n", + "CREATE FOREIGN CATALOG IF NOT EXISTS moodle_learning_hub_catalogue USING CONNECTION moodlelearninghub\n", + " OPTIONS (\n", + " database secret(scope, 'LearningHubMoodle.database')\n", + " );" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7ff9f1ef-8c02-485b-af94-ae74eda44a9b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "learninghub" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6826dd70-4a3a-4c92-9b70-e43dbca12313", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "%sql\n", + "-- CREATE CONNECTION IF NOT EXISTS learninghub\n", + "CREATE CONNECTION IF NOT EXISTS learninghub\n", + " TYPE SQLSERVER\n", + " OPTIONS (\n", + " host secret(scope, 'LearningHub.host'),\n", + " port secret(scope, 'LearningHub.port'),\n", + " user secret(scope, 'LearningHub.username'),\n", + " password secret(scope, 'LearningHub.password')\n", + " ); \n", + "\n", + "-- CREATE CATALOG learninghub_catalog\n", + "CREATE FOREIGN CATALOG IF NOT EXISTS learninghub_catalog USING CONNECTION learninghub\n", + " OPTIONS (\n", + " database secret(scope, 'LearningHub.database')\n", + " );" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "environment_version": "4" + }, + "inputWidgetPreferences": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "CreateDatabaseConnections", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/src/ingestion/ods_ingest - original for reference delete later.py b/src/ingestion/ods_ingest - original for reference delete later.py new file mode 100644 index 0000000..c7bf5d1 --- /dev/null +++ b/src/ingestion/ods_ingest - original for reference delete later.py @@ -0,0 +1,145 @@ +# from pyspark import pipelines as dp + +# container = 'bronze' +# 
storage_account = 'unifiedrptdeltalake' +# path_to_file_or_folder = 'ods' + +# @dp.table( +# name="bronze_ods.Additional_Attributes_Details", +# comment="Import raw Additional_Attributes_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Additional_Attributes_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Code_System_Details", +# comment="Import raw Code_System_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Code_System_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Contact_Details", +# comment="Import raw Contact_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Contact_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Manifest_Details", +# comment="Import raw Manifest_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Manifest_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Organisation_Details", +# comment="Import raw Organisation_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Organisation_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.OtherID_Details", +# comment="Import raw OtherID_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/OtherID_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.PrimaryRole_Details", +# comment="Import raw PrimaryRole_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/PrimaryRole_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Relationship_Details", +# comment="Import raw Relationship_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Relationship_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Role_Details", +# comment="Import raw Role_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Role_Details.csv" +# ) +# ) + +# @dp.table( +# name="bronze_ods.Successor_Details", +# comment="Import raw Successor_Details" +# ) +# def azure_csv_table(): +# return ( +# spark.read.format("csv") 
+# .option("header", "true") +# .option("inferSchema", "true") +# .load( +# f"abfss://{container}@{storage_account}.dfs.core.windows.net/{path_to_file_or_folder}/Successor_Details.csv" +# ) +# ) \ No newline at end of file diff --git a/src/ingestion/ods_ingest.py b/src/ingestion/ods_ingest.py new file mode 100644 index 0000000..9812f39 --- /dev/null +++ b/src/ingestion/ods_ingest.py @@ -0,0 +1,247 @@ +# for making spark tables +# qqqq problem i am having is that we are setting the schema, and dev has schema set as user names +# i want to use the databricks.yml schema name for dev, and for staging and prod i want to set it to bronze_ods in this script +from pyspark import pipelines as dp + + +# Fixed System Constants these and some of this stuff should be going in a helper i think +#ADLS_PROTOCOL = "abfss://" +#ADLS_SUFFIX = ".dfs.core.windows.net" + +# 1. Get the Catalog name +# qqqq i dont think i want a default id prefer an error i think +# if set this in pipeline yml we wont need it +#catalog_name = spark.conf.get("bundle.catalog") + +# 2. Get the Schema Prefix (This is what changes between environments) +# In Dev, this will be the username. In Staging/Prod, it will be blank. +#schema_user_prefix = spark.conf.get("bundle.schema_prefix") +# this will often be a medallion layer but in src we also have transformations and ingestion so i think this mirror folders in source would be logical if team agrees qqqq +#schema_layer = "bronze_" +#schema_domain = "ods" #qqqq check what terminiology we want here + +# Construct the final schema name +#schema_name = (schema_user_prefix + schema_layer + schema_domain) +#print(schema_name) +# The container likely should mirror the layer name? +# container_layer ?? qqqq +#container = spark.conf.get("bundle.layer") # layer is bronze silver etc +# This likely should be dev staging prod +# storage_environment ?? qqqq +# wouldnt have default +# storage_account = spark.conf.get("bundle.storage_account") # 'unifiedrptdeltalake' +storage_container_path = spark.conf.get("pipeline.storage_container_path") +# In our storage our folders maybe should be domain based and if we thing this is manageable as hard rule this variable could be called domain_folder or similar qqqq +# domain_folder ?? qqqq +folder_name = spark.conf.get("pipeline.domain") # ods +#folder_location_path = f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/{folder_name}/" +folder_location_path = f"{storage_container_path }/{folder_name}/" +# "abfss://bronze@unifiedrptdeltalake.dfs.core.windows.net/ods +print(folder_location_path) + + +@dp.table( + # qqqq was f"{schema_name}.Additional_Attributes_Details" but worked before now need to do it this way???!!! 
+ name="Additional_Attributes_Details", + comment="Import raw Additional_Attributes_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Additional_Attributes_Details.csv" + ) + ) + +@dp.table( + name="Code_System_Details", + comment="Import raw Code_System_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Code_System_Details.csv" + ) + ) + +@dp.table( + name="Contact_Details", + comment="Import raw Contact_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Contact_Details.csv" + ) + ) + +@dp.table( + name="Manifest_Details", + comment="Import raw Manifest_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Manifest_Details.csv" + ) + ) + +@dp.table( + name="Organisation_Details", + comment="Import raw Organisation_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Organisation_Details.csv" + ) + ) + +@dp.table( + name="OtherID_Details", + comment="Import raw OtherID_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}OtherID_Details.csv" + ) + ) + +@dp.table( + name="PrimaryRole_Details", + comment="Import raw PrimaryRole_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}PrimaryRole_Details.csv" + ) + ) + +@dp.table( + name="Relationship_Details", + comment="Import raw Relationship_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Relationship_Details.csv" + ) + ) + +@dp.table( + name="Role_Details", + comment="Import raw Role_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Role_Details.csv" + ) + ) + +@dp.table( + name="Successor_Details", + comment="Import raw Successor_Details" +) +def azure_csv_table(): + return ( + spark.read.format("csv") + .option("header", "true") + .option("inferSchema", "true") + .load( + f"{folder_location_path}Successor_Details.csv" + ) + ) + + +###### Try this +# import dlt +# from pyspark.sql import SparkSession + +# # ============================================================================ +# # Configuration from Bundle +# # ============================================================================ +# storage_account = spark.conf.get("bundle.storage_account") +# layer = spark.conf.get("bundle.layer") # "bronze" +# domain = spark.conf.get("bundle.domain") # "ods" + +# # ============================================================================ +# # Constants +# # ============================================================================ +# ADLS_PROTOCOL = "abfss://" +# ADLS_SUFFIX = ".dfs.core.windows.net" + +# # ============================================================================ +# # 
Derived Paths +# # ============================================================================ +# container = layer # Container matches layer +# folder_path = f"{ADLS_PROTOCOL}{container}@{storage_account}{ADLS_SUFFIX}/{domain}/" + +# # ============================================================================ +# # Data Model - ODS Tables +# # ============================================================================ +# ODS_TABLES = [ +# ("Additional_Attributes_Details", "Import raw Additional_Attributes_Details"), +# ("Code_System_Details", "Import raw Code_System_Details"), +# ("Contact_Details", "Import raw Contact_Details"), +# ("Manifest_Details", "Import raw Manifest_Details"), +# ("Organisation_Details", "Import raw Organisation_Details"), +# ("OtherID_Details", "Import raw OtherID_Details"), +# ("PrimaryRole_Details", "Import raw PrimaryRole_Details"), +# ("Relationship_Details", "Import raw Relationship_Details"), +# ("Role_Details", "Import raw Role_Details"), +# ("Successor_Details", "Import raw Successor_Details"), +# ] + +# # ============================================================================ +# # Helper Functions <----- this is what would go in a wheel probably +# # ============================================================================ +# def load_csv(filename: str): +# """Load CSV from Azure storage with standard options""" +# return ( +# spark.read.format("csv") +# .option("header", "true") +# .option("inferSchema", "true") +# .load(f"{folder_path}{filename}.csv") +# ) + +# # ============================================================================ +# # Create DLT Tables +# # ============================================================================ +# for table_name, comment in ODS_TABLES: +# # Closure to capture loop variables correctly +# def create_table(name=table_name, desc=comment): +# @dlt.table(name=name, comment=desc) +# def table_loader(): +# return load_csv(name) +# return table_loader + +# create_table() \ No newline at end of file diff --git a/src/silver/- Copy.gitinclude b/src/silver/- Copy.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/transformations/.gitinclude b/src/transformations/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/src/utils/.gitinclude b/src/utils/.gitinclude new file mode 100644 index 0000000..e69de29 diff --git a/tests/.gitinclude b/tests/.gitinclude new file mode 100644 index 0000000..e69de29