This repository has been archived by the owner on Aug 25, 2023. It is now read-only.

AutoloaderHandle and StreamingHandle #206

Status: Closed. Wants to merge 65 commits from the feature/autoloader branch. The diff below reflects changes from 64 of the 65 commits.

Commits
1b1bf14
feat: initial autoloader handle
LauJohansson Nov 23, 2022
e3c9575
add better upsert logic
LauJohansson Nov 23, 2022
b22849c
stop all streams and remove checkpoints
LauJohansson Nov 23, 2022
ff40da0
more tests
LauJohansson Nov 23, 2022
f6348da
common spark handle class
LauJohansson Nov 23, 2022
6a35eae
skip autoloader tests first
LauJohansson Nov 23, 2022
6f97e0b
linting
LauJohansson Nov 23, 2022
ae2a486
add autoloader tests
LauJohansson Nov 23, 2022
f6212ba
add comment regarding awaitTermination
LauJohansson Nov 23, 2022
9ec8372
add await termination
LauJohansson Nov 24, 2022
d4ea6c1
file exists except all
LauJohansson Nov 24, 2022
3511be3
fix: use delta handle to put data in table
LauJohansson Nov 24, 2022
fe0b38d
use delta format when delta
LauJohansson Nov 24, 2022
850864f
remove self read in write method
LauJohansson Nov 24, 2022
eab53eb
cannot count on read. use isstreaming
LauJohansson Nov 24, 2022
8a0b917
check streaming df and use 10.4 spark
LauJohansson Nov 24, 2022
a4e0744
fix cache test
LauJohansson Nov 24, 2022
922a253
skip cache test
LauJohansson Nov 24, 2022
b342836
feat: update tests
LauJohansson Nov 25, 2022
669dce2
fix: missing names in configs test
LauJohansson Nov 25, 2022
4b1671d
use debug tables
LauJohansson Nov 25, 2022
47ffd45
manual set databricks token
LauJohansson Nov 25, 2022
966f7f1
Merge branch 'main' into feature/autoloader
LauJohansson Nov 25, 2022
873a3db
overwritable return only None
LauJohansson Nov 25, 2022
f109eba
flake8
LauJohansson Nov 25, 2022
b49f77c
checkpoint removal
LauJohansson Nov 25, 2022
bcbed26
feat: add avrofiles using write avro
LauJohansson Nov 28, 2022
fef7699
fix autoloader tests
LauJohansson Nov 28, 2022
85941c1
simplify view creation
LauJohansson Nov 28, 2022
9a2d37b
comment out optimization part of upsert
LauJohansson Nov 28, 2022
09c059a
fix upsertloader test
LauJohansson Nov 28, 2022
199c646
linting
LauJohansson Nov 28, 2022
4a5681e
skip flaky tests
LauJohansson Nov 29, 2022
db2eda1
use same source view in test
LauJohansson Nov 29, 2022
de4c5cf
use append. not overwrite
LauJohansson Nov 29, 2022
32cd63e
only use one source table
LauJohansson Nov 29, 2022
ad10d43
add tableid to class
LauJohansson Nov 29, 2022
0926d90
refactor: renaming and documentation
LauJohansson Nov 30, 2022
c2bdd58
validate delta methods
LauJohansson Nov 30, 2022
9d507e1
add new deltastream class
LauJohansson Nov 30, 2022
2671c9c
split deltastreamhandle and autoloader
LauJohansson Nov 30, 2022
38f79f3
test deltastreamhandle on sink
LauJohansson Nov 30, 2022
dee2f15
create database
LauJohansson Nov 30, 2022
08c46c3
remove path from test table
LauJohansson Nov 30, 2022
7439ccc
validate checkpointpath correct
LauJohansson Nov 30, 2022
839f426
overwrite avro data
LauJohansson Dec 1, 2022
da711f8
delta stream test add delta format
LauJohansson Dec 1, 2022
bb62539
remove autoloader from streamdelta tests
LauJohansson Dec 1, 2022
d6f5d8e
remove avro related tests for delta
LauJohansson Dec 1, 2022
bf48f38
use assert equal
LauJohansson Dec 1, 2022
ecc3ee4
test awaittermination for better testing
LauJohansson Dec 1, 2022
72962ad
rescued data
LauJohansson Dec 1, 2022
3e2c656
tbl4 already exists
LauJohansson Dec 1, 2022
b95b5ee
add format delta
LauJohansson Dec 1, 2022
427d971
feat: await_termination optional
LauJohansson Dec 15, 2022
539638c
Merge branch 'main' into feature/autoloader
LauJohansson Dec 15, 2022
21dfd37
revert to 9.1 testing
LauJohansson Dec 16, 2022
549becf
Merge branch 'main' into feature/autoloader
LauJohansson Dec 16, 2022
ab5e640
use tests again
LauJohansson Dec 16, 2022
9a823cb
assert on spark version
LauJohansson Dec 16, 2022
e404770
only assert on init
LauJohansson Dec 16, 2022
49babf5
fix default await input
LauJohansson Dec 17, 2022
7db4820
fix cls for stream
LauJohansson Dec 17, 2022
9784733
use eh tests again
LauJohansson Dec 19, 2022
9afa177
remove set databricks cfg manual
LauJohansson Jan 4, 2023
18 changes: 18 additions & 0 deletions .github/submit/Set-DatabricksCfgManual.ps1
@@ -0,0 +1,18 @@
param (
# This helper script sets the Databricks CLI config file (~/.databrickscfg) manually
[Parameter(Mandatory=$True)]
[ValidateNotNullOrEmpty()]
[string]
$workspaceUrl,

[Parameter(Mandatory=$True)]
[ValidateNotNullOrEmpty()]
[string]
$token
)


Set-Content ~/.databrickscfg "[DEFAULT]"
Add-Content ~/.databrickscfg "host = https://$workspaceUrl"
Add-Content ~/.databrickscfg "token = $token"
Add-Content ~/.databrickscfg ""
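
In the submit pipeline this helper would presumably be called once before any Databricks CLI commands, along the lines of: pwsh .github/submit/Set-DatabricksCfgManual.ps1 -workspaceUrl $workspaceUrl -token $databricksToken. The exact call site is not part of this diff, and the variable names in that invocation are illustrative only.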
2 changes: 2 additions & 0 deletions src/atc/delta/__init__.py
@@ -1,2 +1,4 @@
from .autoloaderstream_handle import AutoloaderStreamHandle # noqa: F401
from .db_handle import DbHandle # noqa: F401
from .delta_handle import DeltaHandle # noqa: F401
from .deltastream_handle import DeltaStreamHandle # noqa: F401
74 changes: 74 additions & 0 deletions src/atc/delta/autoloaderstream_handle.py
@@ -0,0 +1,74 @@
from pyspark.sql import DataFrame

from atc.configurator.configurator import Configurator
from atc.spark import Spark
from atc.tables import TableHandle
from atc.tables.SparkHandle import DeltaHandleInvalidFormat


class AutoloaderStreamHandle(TableHandle):
def __init__(
self,
*,
location: str,
checkpoint_path: str,
Inline review comment from the contributor author on the checkpoint_path parameter:
@mrmasterplan we need to ensure that the checkpoint path is the same as the write checkpoint path: https://docs.databricks.com/getting-started/etl-quick-start.html#auto-loader

data_format: str,
):
"""
location: the location of the source files to be read

checkpoint_path: The location of the checkpoints, <table_name>/_checkpoints
The Delta Lake VACUUM function removes all files not managed by Delta Lake
but skips any directories that begin with _. You can safely store
checkpoints alongside other data and metadata for a Delta table
using a directory structure such as <table_name>/_checkpoints
See: https://docs.databricks.com/structured-streaming/delta-lake.html

data_format: the data format of the files that are read

"""

assert (
Spark.version() >= Spark.DATABRICKS_RUNTIME_10_4
), f"AutoloaderStreamHandle not available for Spark version {Spark.version()}"

self._location = location
self._data_format = data_format
self._checkpoint_path = checkpoint_path

self._validate()
self._validate_checkpoint()

@classmethod
def from_tc(cls, id: str) -> "AutoloaderStreamHandle":
tc = Configurator()
return cls(
location=tc.table_property(id, "path", None),
data_format=tc.table_property(id, "format", None),
checkpoint_path=tc.table_property(id, "checkpoint_path", None),
)

def _validate(self):
"""Validates that the name is either db.table or just table."""
if self._data_format == "delta":
raise DeltaHandleInvalidFormat("Use DeltaStreamHandle for delta.")

def _validate_checkpoint(self):
if "/_" not in self._checkpoint_path:
print(
"RECOMMENDATION: You can safely store checkpoints alongside "
"other data and metadata for a Delta table using a directory "
"structure such as <table_name>/_checkpoints"
)

def read(self) -> DataFrame:

reader = (
Spark.get()
.readStream.format("cloudFiles")
.option("cloudFiles.format", self._data_format)
.option("cloudFiles.schemaLocation", self._checkpoint_path)
.load(self._location)
)

return reader
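
To make the intended wiring concrete, here is a minimal usage sketch (not part of the diff): it reads through the handle above and appends to a Delta location, reusing the same checkpoint path that was given to the handle, as the inline review comment recommends. The Configurator id, the paths, and the trigger choice are illustrative assumptions only.

from atc.delta import AutoloaderStreamHandle

# Assumes the Configurator knows a table id "LandingAvro" with the
# properties "path", "format" (e.g. avro) and "checkpoint_path" registered.
ash = AutoloaderStreamHandle.from_tc("LandingAvro")

stream = (
    ash.read()  # streaming DataFrame backed by the cloudFiles source
    .writeStream.format("delta")
    .outputMode("append")
    # Reuse the checkpoint path given to the handle, so that schema tracking
    # and write progress live in the same location.
    .option("checkpointLocation", "/mnt/landing/mytable/_checkpoints")
    .trigger(once=True)
    .start("/mnt/silver/mytable")  # illustrative target location
)
stream.awaitTermination()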
101 changes: 4 additions & 97 deletions src/atc/delta/delta_handle.py
@@ -1,35 +1,19 @@
from typing import List, Optional, Union
from typing import List, Union

from pyspark.sql import DataFrame

from atc.configurator.configurator import Configurator
from atc.exceptions import AtcException
from atc.functions import get_unique_tempview_name, init_dbutils
from atc.spark import Spark
from atc.tables.TableHandle import TableHandle
from atc.tables.SparkHandle import SparkHandle
from atc.utils.CheckDfMerge import CheckDfMerge
from atc.utils.GetMergeStatement import GetMergeStatement


class DeltaHandleException(AtcException):
pass


class DeltaHandleInvalidName(DeltaHandleException):
pass


class DeltaHandleInvalidFormat(DeltaHandleException):
pass


class DeltaHandle(TableHandle):
class DeltaHandle(SparkHandle):
def __init__(self, name: str, location: str = None, data_format: str = "delta"):
self._name = name
self._location = location
self._data_format = data_format

self._partitioning: Optional[List[str]] = None
super().__init__(name, location, data_format)

self._validate()

@@ -42,29 +26,6 @@ def from_tc(cls, id: str) -> "DeltaHandle":
data_format=tc.table_property(id, "format", "delta"),
)

def _validate(self):
"""Validates that the name is either db.table or just table."""
if not self._name:
if not self._location:
raise DeltaHandleInvalidName(
"Cannot create DeltaHandle without name or path"
)
self._name = f"delta.`{self._location}`"
else:
name_parts = self._name.split(".")
if len(name_parts) == 1:
self._db = None
self._table_name = name_parts[0]
elif len(name_parts) == 2:
self._db = name_parts[0]
self._table_name = name_parts[1]
else:
raise DeltaHandleInvalidName(f"Could not parse name {self._name}")

# only format delta is supported.
if self._data_format != "delta":
raise DeltaHandleInvalidFormat("Only format delta is supported.")

def read(self) -> DataFrame:
"""Read table by path if location is given, otherwise from name."""
if self._location:
@@ -102,60 +63,6 @@ def drop_and_delete(self) -> None:
if self._location:
init_dbutils().fs.rm(self._location, True)

def create_hive_table(self) -> None:
sql = f"CREATE TABLE IF NOT EXISTS {self._name} "
if self._location:
sql += f" USING DELTA LOCATION '{self._location}'"
Spark.get().sql(sql)

def recreate_hive_table(self):
self.drop()
self.create_hive_table()

def get_partitioning(self):
"""The result of DESCRIBE TABLE tablename is like this:
+-----------------+---------------+-------+
| col_name| data_type|comment|
+-----------------+---------------+-------+
| mycolA| string| |
| myColB| int| |
| | | |
| # Partitioning| | |
| Part 0| mycolA| |
+-----------------+---------------+-------+
but this method return the partitioning in the form ['mycolA'],
if there is no partitioning, an empty list is returned.
"""
if self._partitioning is None:
# create an iterator object and use it in two steps
rows_iter = iter(
Spark.get().sql(f"DESCRIBE TABLE {self.get_tablename()}").collect()
)

# roll over the iterator until you see the title line
for row in rows_iter:
# discard rows until the important section header
if row.col_name.strip() == "# Partitioning":
break
# at this point, the iterator has moved past the section heading
# leaving only the rows with "Part 1" etc.

# create a list from the rest of the iterator like [(0,colA), (1,colB)]
parts = [
(int(row.col_name[5:]), row.data_type)
for row in rows_iter
if row.col_name.startswith("Part ")
]
# sort, just in case the parts were out of order.
parts.sort()

# discard the index and put into an ordered list.
self._partitioning = [p[1] for p in parts]
return self._partitioning

def get_tablename(self) -> str:
return self._name

def upsert(
self,
df: DataFrame,
...
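
The methods and exception classes deleted from DeltaHandle above are not dropped outright; the refactor consolidates them in a shared SparkHandle base used by DeltaHandle and the new streaming handles alike (autoloaderstream_handle.py imports DeltaHandleInvalidFormat from atc.tables.SparkHandle). The SparkHandle source itself is not part of this excerpt, so the following is only a rough sketch of the shared state such a base would carry, assembled from the deleted lines rather than from the actual implementation.

from typing import List, Optional

from atc.exceptions import AtcException
from atc.tables.TableHandle import TableHandle


class DeltaHandleException(AtcException):
    pass


class DeltaHandleInvalidName(DeltaHandleException):
    pass


class DeltaHandleInvalidFormat(DeltaHandleException):
    pass


class SparkHandle(TableHandle):
    # Sketch only: the shared constructor state that DeltaHandle now passes
    # up via super().__init__(name, location, data_format).
    def __init__(self, name: str, location: str = None, data_format: str = "delta"):
        self._name = name
        self._location = location
        self._data_format = data_format
        self._partitioning: Optional[List[str]] = None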