|
6 | 6 | import itertools
|
7 | 7 | import operator
|
8 | 8 | import pickle
|
| 9 | +import types |
9 | 10 | import warnings
|
10 | 11 | from collections import abc
|
11 | 12 | from typing import (
|
@@ -91,6 +92,10 @@ def _dtypes(self):
|
91 | 92 | zip(self._data.names, (col.dtype for col in self._data.columns))
|
92 | 93 | )
|
93 | 94 |
|
@property
def ndim(self) -> int:
    """Number of dimensions of this frame.

    This base implementation is a placeholder and always raises.
    NOTE(review): concrete subclasses presumably override it -- confirm.

    Raises
    ------
    NotImplementedError
        Always.
    """
    raise NotImplementedError
| 98 | + |
94 | 99 | @_cudf_nvtx_annotate
|
95 | 100 | def serialize(self):
|
96 | 101 | # TODO: See if self._data can be serialized outright
|
@@ -417,51 +422,60 @@ def __arrow_array__(self, type=None):
|
@_cudf_nvtx_annotate
def _to_array(
    self,
    get_array: Callable,
    module: types.ModuleType,
    copy: bool,
    dtype: Union[Dtype, None] = None,
    na_value=None,
) -> Union[cupy.ndarray, numpy.ndarray]:
    """Convert the columns of this frame into a single array.

    Internal function to implement ``to_cupy`` and ``to_numpy``, which are
    nearly identical except for the column accessor and the array module
    they pass in.

    Parameters
    ----------
    get_array : Callable
        Called with a single column; returns that column's values as an
        array.
    module : types.ModuleType
        Array namespace; its ``asarray`` and ``empty`` functions are used.
    copy : bool
        If True, a one-dimensional result is never the same object that
        ``get_array`` returned. Two-dimensional results are always written
        into a newly allocated matrix regardless of this flag.
    dtype : Dtype or None, default None
        Target dtype. When None it is inferred: the dtype of the only
        column for a one-column frame, otherwise ``find_common_type`` over
        all column dtypes.
    na_value : optional
        When not None, each column is passed through ``fillna(na_value)``
        before it is converted.

    Returns
    -------
    cupy.ndarray or numpy.ndarray
        For a frame with no columns, an empty float64 array of shape
        ``(len(self), 0)`` in Fortran order. When ``self.ndim == 1``, the
        array built from the first column. Otherwise a Fortran-ordered
        matrix of shape ``(len(self), ncol)``.

    Raises
    ------
    NotImplementedError
        If the inferred dtype is not a ``numpy.dtype`` instance.
    """

    def to_array(
        col: ColumnBase, dtype: numpy.dtype, force_copy: bool
    ) -> Union[cupy.ndarray, numpy.ndarray]:
        # Convert one column, optionally filling nulls first.
        if na_value is not None:
            col = col.fillna(na_value)
        array = get_array(col)
        casted_array = module.asarray(array, dtype=dtype)
        if force_copy and casted_array is array:
            # asarray handed back its input untouched, so no new array
            # exists yet; copy explicitly. When asarray did produce a new
            # object this is skipped to avoid copying twice.
            casted_array = casted_array.copy()
        return casted_array

    # A frame without columns has nothing to infer a dtype from.
    ncol = self._num_columns
    if ncol == 0:
        return module.empty(
            shape=(len(self), ncol),
            dtype=numpy.dtype("float64"),
            order="F",
        )

    if dtype is None:
        if ncol == 1:
            dtype = next(iter(self._data.values())).dtype
        else:
            dtype = find_common_type(
                [col.dtype for col in self._data.values()]
            )

        # NOTE(review): nesting of this check under ``dtype is None`` is
        # reconstructed from a flattened diff -- confirm against the file.
        if not isinstance(dtype, numpy.dtype):
            raise NotImplementedError(
                f"{dtype} cannot be exposed as an array"
            )

    if self.ndim == 1:
        return to_array(self._data.columns[0], dtype, copy)

    matrix = module.empty(shape=(len(self), ncol), dtype=dtype, order="F")
    for i, col in enumerate(self._data.values()):
        # TODO: col.values may fail if there is nullable data or an
        # unsupported dtype. We may want to catch and provide a more
        # suitable error.
        # Assigning into the freshly allocated matrix already copies the
        # column's data, so no explicit per-column copy is requested.
        matrix[:, i] = to_array(col, dtype, False)
    return matrix
465 | 479 |
|
466 | 480 | # TODO: As of now, calling cupy.asarray is _much_ faster than calling
|
467 | 481 | # to_cupy. We should investigate the reasons why and whether we can provide
|
@@ -496,10 +510,9 @@ def to_cupy(
|
496 | 510 | cupy.ndarray
|
497 | 511 | """
|
498 | 512 | return self._to_array(
|
499 |
| - (lambda col: col.values.copy()) |
500 |
| - if copy |
501 |
| - else (lambda col: col.values), |
502 |
| - cupy.empty, |
| 513 | + lambda col: col.values, |
| 514 | + cupy, |
| 515 | + copy, |
503 | 516 | dtype,
|
504 | 517 | na_value,
|
505 | 518 | )
|
@@ -536,7 +549,7 @@ def to_numpy(
|
536 | 549 | )
|
537 | 550 |
|
538 | 551 | return self._to_array(
|
539 |
| - (lambda col: col.values_host), np.empty, dtype, na_value |
| 552 | + lambda col: col.values_host, numpy, copy, dtype, na_value |
540 | 553 | )
|
541 | 554 |
|
542 | 555 | @_cudf_nvtx_annotate
|
|
0 commit comments