Addition of kurt function

hablapps · Jan 15, 2024 · 99bdb1b · 99bdb1b
1 parent d28e93a
commit 99bdb1b
Show file tree

Hide file tree

Showing 3 changed files with 349 additions and 0 deletions.
diff --git a/docs/user-guide/advanced/Pandas_API.ipynb b/docs/user-guide/advanced/Pandas_API.ipynb
@@ -436,6 +436,255 @@
     "tab.mean(axis=1)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "fe565b65-fbf2-47ba-a26e-791d09fd4f55",
+   "metadata": {},
+   "source": [
+    "### Table.kurt()\n",
+    "\n",
+    "```\n",
+    "Table.kurt(axis=0, skipna=True, numeric_only=False)\n",
+    "```\n",
+    "\n",
+    "Return the unbiased kurtosis over the requested axis. Kurtosis is obtained using Fisher’s definition of kurtosis (kurtosis of a normal distribution == 0.0). Normalized by N-1.\n",
+    "\n",
+    "\n",
+    "**Parameters:**\n",
+    "\n",
+    "| Name         | Type | Description                                                                      | Default |\n",
+    "| :----------: | :--: | :------------------------------------------------------------------------------- | :-----: |\n",
+    "| axis         | int  | Axis for the function to be applied on. 0 is columns, 1 is rows.                    | 0       |\n",
+    "| skipna       | bool | Not yet implemented; the current `kurt` signature does not take this argument.  | True    |\n",
+    "| numeric_only | bool | Only use columns of the table that are of a numeric data type.                   | False   |\n",
+    "\n",
+    "**Returns:**\n",
+    "\n",
+    "| Type       | Description                                                                              |\n",
+    "| :--------: | :--------------------------------------------------------------------------------------- |\n",
+    "| Dictionary | Map of columns (`axis=0`) or row indices (`axis=1`) to their kurtosis values |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e6069cac-d260-4f80-9688-3d1ec273cd22",
+   "metadata": {},
+   "source": [
+    "**Examples:**\n",
+    "\n",
+    "Calculate the kurtosis across the columns of a table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "4219c826-a84b-4722-9847-372d3837acdb",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>a</th>\n",
+       "      <th>b</th>\n",
+       "      <th>c</th>\n",
+       "      <th>d</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>7</td>\n",
+       "      <td>7</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2</td>\n",
+       "      <td>2</td>\n",
+       "      <td>8</td>\n",
+       "      <td>11</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>2</td>\n",
+       "      <td>6</td>\n",
+       "      <td>9</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>4</td>\n",
+       "      <td>7</td>\n",
+       "      <td>10</td>\n",
+       "      <td>14</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "pykx.Table(pykx.q('\n",
+       "a b c  d \n",
+       "---------\n",
+       "1 1 7  7 \n",
+       "2 2 8  11\n",
+       "2 6 9  14\n",
+       "4 7 10 14\n",
+       "'))"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tab = kx.Table(data=\n",
+    "    {\n",
+    "        'a': [1, 2, 2, 4],\n",
+    "        'b': [1, 2, 6, 7],\n",
+    "        'c': [7, 8, 9, 10],\n",
+    "        'd': [7, 11, 14, 14]\n",
+    "    }\n",
+    ")\n",
+    "tab"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "437ab485-bf73-4209-b63e-aa0d1bfa5d58",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>a</th>\n",
+       "      <td>2.227147</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>b</th>\n",
+       "      <td>-4.890533</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>c</th>\n",
+       "      <td>-1.2</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>d</th>\n",
+       "      <td>-0.04958678</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "pykx.Dictionary(pykx.q('\n",
+       "a| 2.227147\n",
+       "b| -4.890533\n",
+       "c| -1.2\n",
+       "d| -0.04958678\n",
+       "'))"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tab.kurt()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea3e1cf6-2304-4061-a846-1cbc0572ea9d",
+   "metadata": {},
+   "source": [
+    "Calculate the kurtosis across the rows of a table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "63312e8b-76f0-46eb-b4d7-b2213561c86e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th></th>\n",
+       "      <th></th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>-6f</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>-3.901235</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>-0.1014759</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>-0.6838056</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "pykx.Dictionary(pykx.q('\n",
+       "0| -6\n",
+       "1| -3.901235\n",
+       "2| -0.1014759\n",
+       "3| -0.6838056\n",
+       "'))"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "tab.kurt(axis=1)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7bf853c5",

diff --git a/src/pykx/pandas_api/pandas_meta.py b/src/pykx/pandas_api/pandas_meta.py
@@ -154,6 +154,33 @@ def mean(self, axis: int = 0, numeric_only: bool = False):
             tab
         )
 
+    @api_return
+    def kurt(self, axis: int = 0, numeric_only: bool = False):
+        tab = self
+        if 'Keyed' in str(type(tab)):
+            tab = q.value(tab)
+        if numeric_only:
+            tab = _get_numeric_only_subtable(tab)
+
+        key_str = '' if axis == 0 else '`$string '
+        val_str = '' if axis == 0 else '"f"$value '
+        query_str = 'cols tab' if axis == 0 else 'til count tab'
+        where_str = ' where not (::)~/:r[;1]'
+        kurt_str = ('{res: x - avg x;'
+                    'n: count x;'
+                    'm2: sum res_sq: res xexp 2;'
+                    'm4: sum res_sq xexp 2;'
+                    'adj: 3 * xexp[n - 1;2] % (n - 2) * (n - 3);'
+                    'num: n * (n + 1) * (n - 1) * m4;'
+                    'den: (n - 2) * (n - 3) * m2 xexp 2;'
+                    '(num % den) - adj}')
+        return q(
+            '{[tab]'
+            f'r:{{[tab; x] ({key_str}x; {kurt_str} {val_str}tab[x])}}[tab;] each {query_str};'
+            f'(,/) {{(enlist x 0)!(enlist x 1)}} each r{where_str}}}',
+            tab
+        )
+
     @api_return
     def median(self, axis: int = 0, numeric_only: bool = False):
         tab = self

diff --git a/tests/test_pandas_api.py b/tests/test_pandas_api.py
@@ -1543,6 +1543,79 @@ def test_mean(kx, q):
         q_m = tab.mean(axis=1)
 
 
+def test_kurt(kx, q):
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': [7, 11, 14, 14]
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.kurt()
+    q_m = tab.kurt()
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1)
+    q_m = tab.kurt(axis=1)
+    for c in range(len(q.cols(tab))):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    q['tab'] = kx.toq(df)
+    tab = q('1!`idx xcols update idx: til count tab from tab')
+    p_m = df.kurt()
+    q_m = tab.kurt()
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1)
+    q_m = tab.kurt(axis=1)
+    for c in range(len(q.cols(tab)) - 1):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': ['foo', 'bar', 'baz', 'qux']
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.kurt(numeric_only=True)
+    q_m = tab.kurt(numeric_only=True)
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1, numeric_only=True)
+    q_m = tab.kurt(axis=1, numeric_only=True)
+    for c in range(len(q.cols(tab))):
+        assert np.isnan(p_m[c]) & np.isnan(q_m[q('{`$string x}', c)].py())
+
+    df = pd.DataFrame(
+        {
+            'a': [1, 2, 2, 4],
+            'b': [1, 2, 6, 7],
+            'c': [7, 8, 9, 10],
+            'd': [11, 12, 13, 14],
+            'e': ['foo', 'bar', 'baz', 'qux']
+        }
+    )
+    tab = kx.toq(df)
+    p_m = df.kurt(numeric_only=True)
+    q_m = tab.kurt(numeric_only=True)
+    for c in q.key(q_m).py():
+        assert p_m[c] == q_m[c].py()
+    p_m = df.kurt(axis=1, numeric_only=True)
+    q_m = tab.kurt(axis=1, numeric_only=True)
+    for c in range(len(q.cols(tab)) - 1):
+        assert p_m[c] == q_m[q('{`$string x}', c)].py()
+
+    with pytest.raises(kx.QError):
+        q_m = tab.kurt()
+    with pytest.raises(kx.QError):
+        q_m = tab.kurt(axis=1)
+
+
 def test_median(kx, q):
     df = pd.DataFrame(
         {