diff --git a/docs/source/get_started/installation.rst b/docs/source/get_started/installation.rst
index 41278183..45ecbefe 100644
--- a/docs/source/get_started/installation.rst
+++ b/docs/source/get_started/installation.rst
@@ -6,7 +6,7 @@ Installation
 Installation via PyPI
 ---------------------
 
-.. code-block:: none
+.. code-block:: bash
 
    pip install pytorch-frame
 
@@ -17,6 +17,19 @@ Installation via PyPI
 Installation from master
 ------------------------
 
-.. code-block:: none
+.. code-block:: bash
 
    pip install git+https://github.com/pyg-team/pytorch-frame.git
+
+
+Installation for development
+----------------------------
+
+.. code-block:: bash
+
+   git clone https://github.com/pyg-team/pytorch-frame.git
+   cd pytorch-frame
+   pip install -e .[dev]
+
+   # Install with optional dependencies
+   pip install -e .[dev,full]
diff --git a/docs/source/get_started/introduction.rst b/docs/source/get_started/introduction.rst
index 1e10b969..d9e9b631 100644
--- a/docs/source/get_started/introduction.rst
+++ b/docs/source/get_started/introduction.rst
@@ -31,18 +31,13 @@ If you would like to use your own dataset, refer to the example in :doc:`/handli
 .. code-block:: python
 
     >>> from torch_frame.datasets import Titanic
-
     >>> dataset = Titanic(root='/tmp/titanic')
-
     >>> len(dataset)
     891
-
     >>> dataset.feat_cols
     ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
-
     >>> dataset.materialize()
     Titanic()
-
     >>> dataset.df.head(5)
                  Survived  Pclass                                               Name     Sex   Age  SibSp  Parch            Ticket     Fare Cabin Embarked
     PassengerId
@@ -52,7 +47,6 @@ If you would like to use your own dataset, refer to the example in :doc:`/handli
     4                   1       1  Futrelle, Mrs. Jacques Heath (Lily May Peel)      female  35.0      1      0            113803  53.1000  C123        S
     5                   0       3  Allen, Mr. William Henry                             male  35.0      0      0            373450   8.0500   NaN        S
 
-
 :pyf:`PyTorch Frame` also supports a custom dataset, so that you can use :pyf:`PyTorch Frame` for your own problem.
 Let's say you prepare your :class:`pandas.DataFrame` as :obj:`df` with five columns:
 :obj:`cat1`, :obj:`cat2`, :obj:`num1`, :obj:`num2`, and :obj:`y`.
@@ -149,13 +143,10 @@ A :class:`~torch_frame.data.TensorFrame` contains the following basic properties
 
     >>> tensor_frame.stypes
     [<stype.categorical: 'categorical'>, <stype.numerical: 'numerical'>]
-
     >>> tensor_frame.num_cols
     7
-
     >>> tensor_frame.num_rows
     891
-
     >>> tensor_frame.device
     device(type='cpu')
 
@@ -163,9 +154,8 @@ We support transferring the data in a :class:`~torch_frame.data.TensorFrame` to
 
 .. code-block:: python
 
-    tensor_frame.to("cpu")
-
-    tensor_frame.to("cuda")
+    >>> tensor_frame = tensor_frame.to("cpu")
+    >>> tensor_frame = tensor_frame.to("cuda")
 
 Once a :obj:`~torch_frame.data.Dataset` is materialized, we can retrieve column statistics on the data.
 For each :class:`~torch_frame.stype`, a different set of statistics is calculated.
@@ -184,10 +174,8 @@ For numerical features,
 
     >>> dataset.col_to_stype
     {'Survived': <stype.categorical: 'categorical'>, 'Pclass': <stype.categorical: 'categorical'>, 'Sex': <stype.categorical: 'categorical'>, 'Age': <stype.numerical: 'numerical'>, 'SibSp': <stype.numerical: 'numerical'>, 'Parch': <stype.numerical: 'numerical'>, 'Fare': <stype.numerical: 'numerical'>, 'Embarked': <stype.categorical: 'categorical'>}
-
    >>> dataset.col_stats['Sex']
    {<StatType.COUNT: 'COUNT'>: (['male', 'female'], [577, 314])}
-
    >>> dataset.col_stats['Age']
    {<StatType.MEAN: 'MEAN'>: 29.69911764705882, <StatType.STD: 'STD'>: 14.516321150817316, <StatType.QUANTILES: 'QUANTILES'>: [0.42, 20.125, 28.0, 38.0, 80.0]}
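
For reference, the custom-dataset workflow mentioned in the ``introduction.rst`` hunks above can be sketched roughly as follows. The toy column values, the exact stype assignments, and the choice of :obj:`y` as a categorical target are illustrative assumptions, not part of the original documentation:

.. code-block:: python

    import pandas as pd
    import torch_frame
    from torch_frame.data import Dataset

    # Toy frame with the five columns mentioned above (values are made up).
    df = pd.DataFrame({
        'cat1': ['a', 'b', 'a', 'c'],
        'cat2': ['x', 'y', 'y', 'x'],
        'num1': [0.1, 0.2, 0.3, 0.4],
        'num2': [1.0, 2.0, 3.0, 4.0],
        'y': [0, 1, 0, 1],
    })

    # Declare the semantic type of each column and mark `y` as the target.
    dataset = Dataset(
        df,
        col_to_stype={
            'cat1': torch_frame.categorical,
            'cat2': torch_frame.categorical,
            'num1': torch_frame.numerical,
            'num2': torch_frame.numerical,
            'y': torch_frame.categorical,
        },
        target_col='y',
    )
    dataset.materialize()  # builds dataset.tensor_frame and dataset.col_stats
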
diff --git a/docs/source/handling_advanced_stypes/handle_heterogeneous_stypes.rst b/docs/source/handling_advanced_stypes/handle_heterogeneous_stypes.rst
index a42f30a5..f4f5df4f 100644
--- a/docs/source/handling_advanced_stypes/handle_heterogeneous_stypes.rst
+++ b/docs/source/handling_advanced_stypes/handle_heterogeneous_stypes.rst
@@ -48,32 +48,31 @@ First, let us create a sample dataset with many different stypes.
         'Embedding': list(embedding)
     })
 
-    df.head()
-    >>>
-       Numerical Categorical       Time                                   Multicategorical                                          Embedding
+.. code-block:: python
+
+    >>> df.head()
+       Numerical Categorical       Time                                   Multicategorical                                          Embedding
     0         44      Type 2 2023-01-01              [Category D, Category A, Category B]  [0.2879910043632805, 0.38346222503494787, 0.74...
     1         47      Type 2 2023-01-02  [Category C, Category A, Category B, Category D]  [0.0923738894608982, 0.3540466620838102, 0.551...
     2         64      Type 2 2023-01-03                           [Category D, Category C]  [0.3209972413734975, 0.22126268518378278, 0.14...
     3         67      Type 1 2023-01-04                           [Category C, Category A]  [0.2603409275874047, 0.5370225213757797, 0.447...
     4         67      Type 2 2023-01-05                                       [Category A]  [0.46924917399024213, 0.8411401297855995, 0.90...
-
 Now let's load the :class:`pandas.DataFrame` into :class:`torch_frame.data.Dataset` class so that we can generate a :class:`~torch_frame.data.tensor_frame.TensorFrame` representation from it.
 
 .. code-block:: python
 
-    dataset = Dataset(
-        df, col_to_stype={
-            'Numerical': stype.numerical,
-            'Categorical': stype.categorical,
-            'Time': stype.timestamp,
-            'Multicategorical': stype.multicategorical,
-            'Embedding': stype.embedding
-        })
-    dataset.materialize()
-
-    dataset.tensor_frame
-    >>> TensorFrame(
+    >>> dataset = Dataset(
+    ...     df, col_to_stype={
+    ...         'Numerical': stype.numerical,
+    ...         'Categorical': stype.categorical,
+    ...         'Time': stype.timestamp,
+    ...         'Multicategorical': stype.multicategorical,
+    ...         'Embedding': stype.embedding
+    ...     })
+    >>> dataset.materialize()
+    >>> dataset.tensor_frame
+    TensorFrame(
       num_cols=4,
       num_rows=100,
       categorical (1): ['Categorical'],
@@ -82,7 +81,7 @@ Now let's load the :class:`pandas.DataFrame` into :class:`torch_frame.data.Datas
       embedding (1): ['Embedding'],
       has_target=True,
       device='cpu',
-      )
+    )
 
 For each :class:`~torch_frame.stype`, we need to specify its encoder in :obj:`stype_encoder_dict`.
 
@@ -112,28 +111,26 @@ Now we can specify the :obj:`stype_encoder_dict` to a model of your choice.
 
 .. code-block:: python
 
-    from torch_frame.nn.models.ft_transformer import FTTransformer
-
-    model = FTTransformer(
-        channels=16,
-        out_channels=1,
-        num_layers=2,
-        col_stats=dataset.col_stats,
-        col_names_dict=dataset.tensor_frame.col_names_dict,
-        stype_encoder_dict=stype_encoder_dict,
-    )
-
-    model(dataset.tensor_frame)
-    >>> tensor([[ 0.9405],
-        [ 0.3857],
-        [ 0.5265],
-        [-0.3747],
-        [ 0.7496],
-        [ 0.0486],
-        [ 0.2895],
-        [ 0.1326],
-        [ 0.4388],
-        [-0.1665]], grad_fn=<AddmmBackward0>)
+    >>> from torch_frame.nn.models.ft_transformer import FTTransformer
+    >>> model = FTTransformer(
+    ...     channels=16,
+    ...     out_channels=1,
+    ...     num_layers=2,
+    ...     col_stats=dataset.col_stats,
+    ...     col_names_dict=dataset.tensor_frame.col_names_dict,
+    ...     stype_encoder_dict=stype_encoder_dict,
+    ... )
+    >>> model(dataset.tensor_frame)
+    tensor([[ 0.9405],
+            [ 0.3857],
+            [ 0.5265],
+            [-0.3747],
+            [ 0.7496],
+            [ 0.0486],
+            [ 0.2895],
+            [ 0.1326],
+            [ 0.4388],
+            [-0.1665]], grad_fn=<AddmmBackward0>)
 
 Auto-Inference of Semantic Types
 --------------------------------
@@ -142,12 +139,12 @@ We offer a simple utility function :class:`~torch_frame.utils.infer_df_stype` to
 
 .. code-block:: python
 
-    infer_df_stype(df)
-    >>> {'Numerical': <stype.numerical: 'numerical'>,
-         'Categorical': <stype.categorical: 'categorical'>,
-         'Time': <stype.timestamp: 'timestamp'>,
-         'Multicategorical': <stype.multicategorical: 'multicategorical'>,
-         'Embedding': <stype.embedding: 'embedding'>}
+    >>> infer_df_stype(df)
+    {'Numerical': <stype.numerical: 'numerical'>,
+     'Categorical': <stype.categorical: 'categorical'>,
+     'Time': <stype.timestamp: 'timestamp'>,
+     'Multicategorical': <stype.multicategorical: 'multicategorical'>,
+     'Embedding': <stype.embedding: 'embedding'>}
 
 However, the inference may not be always correct/best for your data.
 We recommend you to double-check the correctness yourself before proceeding.
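
The double-checking step recommended above can be sketched as follows; the particular override and the use of ``'Numerical'`` as the target column are illustrative assumptions based on the materialized :class:`TensorFrame` shown earlier:

.. code-block:: python

    from torch_frame import stype
    from torch_frame.data import Dataset
    from torch_frame.utils import infer_df_stype

    col_to_stype = infer_df_stype(df)
    # Review the inferred mapping and override anything that looks wrong
    # (this override is only an example).
    col_to_stype['Multicategorical'] = stype.multicategorical
    dataset = Dataset(df, col_to_stype=col_to_stype, target_col='Numerical')
    dataset.materialize()
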
@@ -211,35 +208,30 @@ If not specified, :class:`pandas` internal :meth:`~pandas.to_datetime` function
 
 .. code-block:: python
 
-    dates = pd.date_range(start="2023-01-01", periods=5, freq='D')
-
-    df = pd.DataFrame({
-        'Time1': dates,  # ISO 8601 format (default)
-        'Time2': dates.strftime('%Y-%m-%d %H:%M:%S'),
-    })
-
-    df.head()
-    >>>     Time1                Time2
-    0  2023-01-01  2023-01-01 00:00:00
-    1  2023-01-02  2023-01-02 00:00:00
-    2  2023-01-03  2023-01-03 00:00:00
-    3  2023-01-04  2023-01-04 00:00:00
-    4  2023-01-05  2023-01-05 00:00:00
-
-    dataset = Dataset(
-        df, col_to_stype={
-            'Time1': stype.timestamp,
-            'Time2': stype.timestamp,
-        }, col_to_time_format='%Y-%m-%d %H:%M:%S')
-
-    dataset.materialize()
-
-    dataset.col_stats
-    >>> {'Time1': {<StatType.YEAR_RANGE: 'YEAR_RANGE'>: [2023, 2023],
+    >>> dates = pd.date_range(start="2023-01-01", periods=5, freq='D')
+    >>> df = pd.DataFrame({
+    ...     'Time1': dates,  # ISO 8601 format (default)
+    ...     'Time2': dates.strftime('%Y-%m-%d %H:%M:%S'),
+    ... })
+    >>> df.head()
+            Time1                Time2
+    0  2023-01-01  2023-01-01 00:00:00
+    1  2023-01-02  2023-01-02 00:00:00
+    2  2023-01-03  2023-01-03 00:00:00
+    3  2023-01-04  2023-01-04 00:00:00
+    4  2023-01-05  2023-01-05 00:00:00
+    >>> dataset = Dataset(
+    ...     df, col_to_stype={
+    ...         'Time1': stype.timestamp,
+    ...         'Time2': stype.timestamp,
+    ...     }, col_to_time_format='%Y-%m-%d %H:%M:%S')
+    >>> dataset.materialize()
+    >>> dataset.col_stats
+    {'Time1': {<StatType.YEAR_RANGE: 'YEAR_RANGE'>: [2023, 2023],
      <StatType.NEWEST_TIME: 'NEWEST_TIME'>: tensor([2023, 0, 4, 3, 0, 0, 0]),
      <StatType.OLDEST_TIME: 'OLDEST_TIME'>: tensor([2023, 0, 0, 6, 0, 0, 0]),
      <StatType.MEDIAN_TIME: 'MEDIAN_TIME'>: tensor([2023, 0, 2, 1, 0, 0, 0])},
-     'Time2': {<StatType.YEAR_RANGE: 'YEAR_RANGE'>: [2023, 2023],
+    'Time2': {<StatType.YEAR_RANGE: 'YEAR_RANGE'>: [2023, 2023],
      <StatType.NEWEST_TIME: 'NEWEST_TIME'>: tensor([2023, 0, 4, 3, 0, 0, 0]),
      <StatType.OLDEST_TIME: 'OLDEST_TIME'>: tensor([2023, 0, 0, 6, 0, 0, 0]),
      <StatType.MEDIAN_TIME: 'MEDIAN_TIME'>: tensor([2023, 0, 2, 1, 0, 0, 0])}}
diff --git a/docs/source/handling_advanced_stypes/handle_text.rst b/docs/source/handling_advanced_stypes/handle_text.rst
index 71f10548..f0a92610 100644
--- a/docs/source/handling_advanced_stypes/handle_text.rst
+++ b/docs/source/handling_advanced_stypes/handle_text.rst
@@ -57,7 +57,6 @@ Next, we create a text encoder class that encodes a list of strings into text em
 
 .. code-block:: python
 
-    from typing import List
     import torch
     from torch import Tensor
     from sentence_transformers import SentenceTransformer
@@ -66,7 +65,7 @@ Next, we create a text encoder class that encodes a list of strings into text em
         def __init__(self, device: torch.device):
             self.model = SentenceTransformer('all-distilroberta-v1', device=device)
 
-        def __call__(self, sentences: List[str]) -> Tensor:
+        def __call__(self, sentences: list[str]) -> Tensor:
             # Encode a list of batch_size sentences into a PyTorch Tensor of
             # size [batch_size, emb_dim]
             embeddings = self.model.encode(
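
Before it is handed to the dataset, the encoder above is typically wrapped into an embedder configuration keyed by column name. A minimal sketch, assuming the encoder class is named ``TextToEmbedding`` (its name sits outside the diff context) and using an illustrative batch size:

.. code-block:: python

    import torch
    from torch_frame.config.text_embedder import TextEmbedderConfig

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # `TextToEmbedding` is the encoder class defined above (name assumed).
    # `batch_size` controls how many sentences are encoded per call
    # during materialization.
    col_to_text_embedder_cfg = {
        'description': TextEmbedderConfig(
            text_embedder=TextToEmbedding(device=device),
            batch_size=32,
        ),
    }
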
@@ -117,35 +116,31 @@ Once :obj:`col_to_text_embedder_cfg` is specified, we can pass it to
 
 .. code-block:: python
 
-    import torch_frame
-    from torch_frame.datasets import MultimodalTextBenchmark
+    >>> import torch_frame
+    >>> from torch_frame.datasets import MultimodalTextBenchmark
+    >>> dataset = MultimodalTextBenchmark(
+    ...     root='/tmp/multimodal_text_benchmark/wine_reviews',
+    ...     name='wine_reviews',
+    ...     col_to_text_embedder_cfg=col_to_text_embedder_cfg,
+    ... )
+    >>> dataset.feat_cols  # This dataset contains one text column `description`
+    ['description', 'country', 'province', 'points', 'price']
-
-    dataset = MultimodalTextBenchmark(
-        root='/tmp/multimodal_text_benchmark/wine_reviews',
-        name='wine_reviews',
-        col_to_text_embedder_cfg=col_to_text_embedder_cfg,
-    )
-
-    dataset.feat_cols  # This dataset contains one text column `description`
-    >>> ['description', 'country', 'province', 'points', 'price']
-
-    dataset.col_to_stype['description']
-    >>> <stype.text_embedded: 'text_embedded'>
+    >>> dataset.col_to_stype['description']
+    <stype.text_embedded: 'text_embedded'>
 
 We then call :obj:`dataset.materialize(path=...)`, which will use text embedding models to pre-encode
 :obj:`text_embedded` columns based on the given :obj:`col_to_text_embedder_cfg`.
 
 .. code-block:: python
 
-    # Pre-encode text columns based on col_to_text_embedder_cfg. This may take a while.
-    dataset.materialize(path='/tmp/multimodal_text_benchmark/wine_reviews/data.pt')
-
-    len(dataset)
-    >>> 105154
-
-    # Text embeddings are stored as MultiNestedTensor
-    dataset.tensor_frame.feat_dict[torch_frame.embedding]
-    >>> MultiNestedTensor(num_rows=105154, num_cols=1, device='cpu')
+    >>> # Pre-encode text columns based on col_to_text_embedder_cfg. This may take a while.
+    >>> dataset.materialize(path='/tmp/multimodal_text_benchmark/wine_reviews/data.pt')
+    >>> len(dataset)
+    105154
+    >>> # Text embeddings are stored as MultiNestedTensor
+    >>> dataset.tensor_frame.feat_dict[torch_frame.embedding]
+    MultiNestedTensor(num_rows=105154, num_cols=1, device='cpu')
 
 It is strongly recommended to specify the :obj:`path` during :meth:`~torch_frame.data.Dataset.materialize`.
 It will cache generated :class:`~torch_frame.data.TensorFrame`, therefore, avoiding embedding texts in
@@ -206,7 +201,6 @@ Let's first create a tokenization class that tokenizes a list of strings to a di
 
 .. code-block:: python
 
-    from typing import List
     from transformers import AutoTokenizer
     from torch_frame.typing import TextTokenizationOutputs
 
@@ -214,7 +208,7 @@ Let's first create a tokenization class that tokenizes a list of strings to a di
         def __init__(self):
             self.tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
 
-        def __call__(self, sentences: List[str]) -> TextTokenizationOutputs:
+        def __call__(self, sentences: list[str]) -> TextTokenizationOutputs:
             # Tokenize batches of sentences
             return self.tokenizer(
                 sentences,
@@ -223,8 +217,6 @@ Let's first create a tokenization class that tokenizes a list of strings to a di
                 return_tensors='pt',
             )
 
-
-
 Here, the output :class:`~torch_frame.typing.TextTokenizationOutputs` is a dictionary,
 where the keys include :obj:`input_ids` and :obj:`attention_mask`, and the values contain
 :pytorch:`PyTorch` tensors of tokens and attention masks.
@@ -252,9 +244,9 @@ text columns with :class:`stype.text_tokenized`.
 
     # Prepare text_tokenizer0 and text_tokenizer1 for text_col0 and text_col1, respectively.
     col_to_text_tokenizer_cfg = {
         "text_col0":
-        TextTokenizerConfig(text_tokenizer=text_tokenizer0, batch_size=10000),
+        TextTokenizerConfig(text_tokenizer=text_tokenizer0, batch_size=10_000),
         "text_col1":
-        TextTokenizerConfig(text_tokenizer=text_tokenizer1, batch_size=20000),
+        TextTokenizerConfig(text_tokenizer=text_tokenizer1, batch_size=20_000),
     }
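
As a standalone illustration of the tokenizer output described above, calling the Hugging Face tokenizer directly produces the same kind of dictionary; the sentences here are made up:

.. code-block:: python

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    out = tokenizer(
        ['A crisp, dry white.', 'Bold tannins and a long finish.'],
        padding=True,
        truncation=True,
        return_tensors='pt',
    )
    # The result contains `input_ids` and `attention_mask`,
    # each of shape [batch_size, seq_len].
    print(out['input_ids'].shape, out['attention_mask'].shape)
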
@@ -266,31 +258,27 @@ Once :obj:`col_to_text_tokenizer_cfg` is specified, we can pass it to
 
 .. code-block:: python
 
-    import torch_frame
-    from torch_frame.datasets import MultimodalTextBenchmark
-
-    dataset = MultimodalTextBenchmark(
-        root='/tmp/multimodal_text_benchmark/wine_reviews',
-        name='wine_reviews',
-        text_stype=torch_frame.text_tokenized,
-        col_to_text_tokenizer_cfg=col_to_text_tokenizer_cfg,
-    )
-
-    dataset.col_to_stype['description']
-    >>> <stype.text_tokenized: 'text_tokenized'>
-
+    >>> import torch_frame
+    >>> from torch_frame.datasets import MultimodalTextBenchmark
+    >>> dataset = MultimodalTextBenchmark(
+    ...     root='/tmp/multimodal_text_benchmark/wine_reviews',
+    ...     name='wine_reviews',
+    ...     text_stype=torch_frame.text_tokenized,
+    ...     col_to_text_tokenizer_cfg=col_to_text_tokenizer_cfg,
+    ... )
+    >>> dataset.col_to_stype['description']
+    <stype.text_tokenized: 'text_tokenized'>
 
 We then call :obj:`dataset.materialize()`, which will use the text tokenizers to pre-tokenize
 :obj:`text_tokenized` columns based on the given :obj:`col_to_text_tokenizer_cfg`.
 
 .. code-block:: python
 
-    # Pre-encode text columns based on col_to_text_tokenizer_cfg.
-    dataset.materialize()
-
-    # A dictionary of text tokenization results
-    dataset.tensor_frame.feat_dict[torch_frame.text_tokenized]
-    >>> {'input_ids': MultiNestedTensor(num_rows=105154, num_cols=1, device='cpu'), 'attention_mask': MultiNestedTensor(num_rows=105154, num_cols=1, device='cpu')}
+    >>> # Pre-encode text columns based on col_to_text_tokenizer_cfg.
+    >>> dataset.materialize()
+    >>> # A dictionary of text tokenization results
+    >>> dataset.tensor_frame.feat_dict[torch_frame.text_tokenized]
+    {'input_ids': MultiNestedTensor(num_rows=105154, num_cols=1, device='cpu'), 'attention_mask': MultiNestedTensor(num_rows=105154, num_cols=1, device='cpu')}
 
 Notice that we use a dictionary of :obj:`~torch_frame.data.MultiNestedTensor` to store the tokenized results.
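
A rough sketch of inspecting those tokenized features after materialization; the per-cell indexing behaviour shown here is an assumption about :class:`~torch_frame.data.MultiNestedTensor` rather than something stated above:

.. code-block:: python

    feat = dataset.tensor_frame.feat_dict[torch_frame.text_tokenized]
    input_ids = feat['input_ids']            # MultiNestedTensor
    attention_mask = feat['attention_mask']  # MultiNestedTensor
    print(input_ids.num_rows, input_ids.num_cols)
    # Indexing a single (row, column) cell is assumed to return the
    # variable-length token-id tensor for that row's text column.
    print(input_ids[0, 0])
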