Sparse Indexes in DynamoDB#

Keywords: AWS, Amazon, DynamoDB

所谓 Spare Indexes 就是当 Index 里的 PK 或者 SK 在 Base Table 的 Item 中不存在时, 这个 Item 就不会在 Index 中出现. 这适合你只需要把一小部分的 Item 放到 Index 中的情况.

官方文档中有个很好的例子. 电商有一个表记录了所有的 Order. PK 是 CustomerId, SK 是 OrderId. 用户登录后需要查看已经下单, 但还没有送达的 Order. 偶尔需要查看所有的 Order 的历史记录. 显然前者是更高频的需求. 考虑到一个 Customer 一辈子下的单不可能太夸张, 所以你完全可以用 CustomerId 获得所有 Order, 然后再内存中进行排序. 但考虑到一般顶多有 1 到 2 个订单处于这种情况, 为了这 2 个订单查询了几十个订单还是比较浪费.

这里介绍一个利用 Sparse Indexes 对齐进行优化的办法. 你可能有一个 Attribute 叫 CreateAt, 记录了订单创建的时间, 还有一个 Attribute 叫做 Status, 它的值可能是 pending, delivering, delivered. 这里你不要直接用 Status 做 index, 而是专门创建一个 attribuge 叫做 OrderCreateAt, 它是一个时间戳, 只有这个订单处于 pending, delivering 的状态时它有值. 处于 delivered 的状态时候这个 attribute 就没有了. 然后你可以创建一个 Index, PK 是 CustomerId, SK 是 OrderCreateAt. 这样你用这个 index 可以轻松找到谋个用户还没有送达的订单, 并按照时间顺序排列.

Reference:

[1]:
import pynamodb_mate as pm
from boto_session_manager import BotoSesManager

bsm = BotoSesManager(profile_name="bmt_app_dev_us_east_1")
with bsm.awscli():
    pm.Connection()

Example 1#

下面这个例子说明了 sort key 不能是 Null.

[2]:
class Model1(pm.Model):
    class Meta:
        table_name = "sparce_indexes_test_1"
        region = "us-east-1"
        billing_mode = pm.PAY_PER_REQUEST_BILLING_MODE

    pk = pm.UnicodeAttribute(hash_key=True)
    sk = pm.UnicodeAttribute(range_key=True)


Model1.create_table(wait=True)

# Not gonna work
model = Model1(doc_id="pk-1", sk=None)
model.save()
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[2], line 14
     11 Model1.create_table(wait=True)
     13 # Not gonna work
---> 14 model = Model1(doc_id="pk-1", sk=None)
     15 model.save()

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb_mate/models.py:95, in Model.__init__(self, hash_key, range_key, **attributes)
     89 def __init__(
     90     self,
     91     hash_key: T.Optional[T.Any] = None,
     92     range_key: T.Optional[T.Any] = None,
     93     **attributes,
     94 ):
---> 95     super().__init__(hash_key, range_key, **attributes)
     96     self.__post_init__()

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/models.py:322, in Model.__init__(self, hash_key, range_key, _user_instantiated, **attributes)
    320         raise ValueError(f"This model has no range key, but a range key value was provided: {range_key}")
    321     attributes[self._range_keyname] = range_key
--> 322 super(Model, self).__init__(_user_instantiated=_user_instantiated, **attributes)

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/attributes.py:278, in AttributeContainer.__init__(self, _user_instantiated, **attributes)
    276 self._set_discriminator()
    277 self._set_defaults(_user_instantiated=_user_instantiated)
--> 278 self._set_attributes(**attributes)

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/attributes.py:337, in AttributeContainer._set_attributes(self, **attributes)
    335 for attr_name, attr_value in attributes.items():
    336     if attr_name not in self.get_attributes():
--> 337         raise ValueError("Attribute {} specified does not exist".format(attr_name))
    338     setattr(self, attr_name, attr_value)

ValueError: Attribute doc_id specified does not exist

Example 2#

下面这个例子说明了如果你的 Index 只有 PK, 且 PK 这个 attribute 在 Base table 中的 Item 中不存在, 那么这个 Item 就不会出现在 Index 里.

[5]:
class Model2Index(pm.GlobalSecondaryIndex):
    class Meta:
        index = "model2-index"
        projection = pm.KeysOnlyProjection()

    gsi_pk = pm.UnicodeAttribute(hash_key=True)


class Model2(pm.Model):
    class Meta:
        table_name = "sparce_indexes_test_2"
        region = "us-east-1"
        billing_mode = pm.PAY_PER_REQUEST_BILLING_MODE

    pk = pm.UnicodeAttribute(hash_key=True)
    gsi_pk = pm.UnicodeAttribute(null=True)

    index = Model2Index()

Model2.create_table(wait=True)
[6]:
Model2(pk="id-1", gsi_pk=None).save()
Model2(pk="id-2", gsi_pk="id-2-gsi-pk").save()
[6]:
{'ConsumedCapacity': {'CapacityUnits': 2.0,
  'TableName': 'sparce_indexes_test_2'}}
[7]:
for i in Model2.index.query(hash_key=None):
    print(i)
---------------------------------------------------------------------------
VerboseClientError                        Traceback (most recent call last)
File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/connection/base.py:1396, in Connection.query(self, table_name, hash_key, range_key_condition, filter_condition, attributes_to_get, consistent_read, exclusive_start_key, index_name, limit, return_consumed_capacity, scan_index_forward, select, settings)
   1395 try:
-> 1396     return self.dispatch(QUERY, operation_kwargs, settings)
   1397 except BOTOCORE_EXCEPTIONS as e:

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/connection/base.py:347, in Connection.dispatch(self, operation_name, operation_kwargs, settings)
    346 self.send_pre_boto_callback(operation_name, req_uuid, table_name)
--> 347 data = self._make_api_call(operation_name, operation_kwargs, settings)
    348 self.send_post_boto_callback(operation_name, req_uuid, table_name)

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/connection/base.py:475, in Connection._make_api_call(self, operation_name, operation_kwargs, settings)
    474 try:
--> 475     raise VerboseClientError(
    476         botocore_expected_format,
    477         operation_name,
    478         verbose_properties,
    479         cancellation_reasons=(
    480             (
    481                 CancellationReason(
    482                     code=d['Code'],
    483                     message=d.get('Message'),
    484                 ) if d['Code'] != 'None' else None
    485             )
    486             for d in data.get('CancellationReasons', [])
    487         ),
    488     )
    489 except VerboseClientError as e:

VerboseClientError: An error occurred (ValidationException) on request (I1NDF93NG76SP0Q6RC410310FBVV4KQNSO5AEMVJF66Q9ASUAAJG) on table (sparce_indexes_test_2) when calling the Query operation: ExpressionAttributeValues contains invalid value: Supplied AttributeValue is empty, must contain exactly one of the supported datatypes for key :0

During handling of the above exception, another exception occurred:

QueryError                                Traceback (most recent call last)
Cell In[7], line 1
----> 1 for i in Model2.index.query(hash_key=None):
      2     print(i)

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/pagination.py:193, in ResultIterator.__next__(self)
    190     raise StopIteration
    192 while self._index == self._count:
--> 193     self._get_next_page()
    195 item = self._items[self._index]
    196 self._index += 1

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/pagination.py:179, in ResultIterator._get_next_page(self)
    178 def _get_next_page(self) -> None:
--> 179     page = next(self.page_iter)
    180     self._count = page[CAMEL_COUNT]
    181     self._items = page.get(ITEMS)  # not returned if 'Select' is set to 'COUNT'

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/pagination.py:113, in PageIterator.__next__(self)
    111     self._rate_limiter.acquire()
    112     self._kwargs['return_consumed_capacity'] = TOTAL
--> 113 page = self._operation(*self._args, settings=self._settings, **self._kwargs)
    114 self._last_evaluated_key = page.get(LAST_EVALUATED_KEY)
    115 self._is_last_page = self._last_evaluated_key is None

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/connection/table.py:273, in TableConnection.query(self, hash_key, range_key_condition, filter_condition, attributes_to_get, consistent_read, exclusive_start_key, index_name, limit, return_consumed_capacity, scan_index_forward, select, settings)
    255 def query(
    256     self,
    257     hash_key: str,
   (...)
    268     settings: OperationSettings = OperationSettings.default,
    269 ) -> Dict:
    270     """
    271     Performs the Query operation and returns the result
    272     """
--> 273     return self.connection.query(
    274         self.table_name,
    275         hash_key,
    276         range_key_condition=range_key_condition,
    277         filter_condition=filter_condition,
    278         attributes_to_get=attributes_to_get,
    279         consistent_read=consistent_read,
    280         exclusive_start_key=exclusive_start_key,
    281         index_name=index_name,
    282         limit=limit,
    283         return_consumed_capacity=return_consumed_capacity,
    284         scan_index_forward=scan_index_forward,
    285         select=select,
    286         settings=settings,
    287     )

File ~/Documents/GitHub/learn_aws-project/.venv/lib/python3.10/site-packages/pynamodb/connection/base.py:1398, in Connection.query(self, table_name, hash_key, range_key_condition, filter_condition, attributes_to_get, consistent_read, exclusive_start_key, index_name, limit, return_consumed_capacity, scan_index_forward, select, settings)
   1396     return self.dispatch(QUERY, operation_kwargs, settings)
   1397 except BOTOCORE_EXCEPTIONS as e:
-> 1398     raise QueryError("Failed to query items: {}".format(e), e)

QueryError: Failed to query items: An error occurred (ValidationException) on request (I1NDF93NG76SP0Q6RC410310FBVV4KQNSO5AEMVJF66Q9ASUAAJG) on table (sparce_indexes_test_2) when calling the Query operation: ExpressionAttributeValues contains invalid value: Supplied AttributeValue is empty, must contain exactly one of the supported datatypes for key :0
[8]:
for i in Model2.index.query(hash_key="id-2-gsi-pk"):
    print(i.attribute_values)
{'gsi_pk': 'id-2-gsi-pk', 'pk': 'id-2'}

Example 3#

下面这个例子说明了如果你的 Index 有 PK 和 SK, 且 PK 和 SK 的 attribute 中的任意一个在 Base table 中的 Item 中不存在, 那么这个 Item 就不会出现在 Index 里.

[9]:
class Model3Index(pm.GlobalSecondaryIndex):
    class Meta:
        index = "model3-index"
        projection = pm.KeysOnlyProjection()

    gsi_pk = pm.UnicodeAttribute(hash_key=True)
    gsi_sk = pm.UnicodeAttribute(range_key=True)


class Model3(pm.Model):
    class Meta:
        table_name = "sparce_indexes_test_3"
        region = "us-east-1"
        billing_mode = pm.PAY_PER_REQUEST_BILLING_MODE

    pk = pm.UnicodeAttribute(hash_key=True)
    gsi_pk = pm.UnicodeAttribute(null=True)
    gsi_sk = pm.UnicodeAttribute(null=True)

    index = Model3Index()

Model3.create_table(wait=True)
[10]:
Model3(pk="id-1", gsi_pk=None, gsi_sk=None).save()
Model3(pk="id-2", gsi_pk="id-2-gsi-pk", gsi_sk=None).save()
Model3(pk="id-3", gsi_pk=None, gsi_sk="id-3-gsi-sk").save()
Model3(pk="id-4", gsi_pk="id-4-gsi-pk", gsi_sk="id-4-gsi-sk").save() # only this will be in the index
[10]:
{'ConsumedCapacity': {'CapacityUnits': 2.0,
  'TableName': 'sparce_indexes_test_3'}}
[11]:
# Nothing there
for i in Model3.index.query(hash_key="id-2-gsi-pk"):
    print(i.attribute_values)
[12]:
for i in Model3.index.query(hash_key="id-4-gsi-pk"):
    print(i.attribute_values)
{'gsi_pk': 'id-4-gsi-pk', 'gsi_sk': 'id-4-gsi-sk', 'pk': 'id-4'}

Example 4#

这个例子是说明在 GSI 中, PK 和 SK 合起来并不需要是唯一的. GSI 只是像 DynamoDB Table, 但它不是一个真正的 GSI Table.

[13]:
class Model4Index(pm.GlobalSecondaryIndex):
    class Meta:
        index = "model4-index"
        projection = pm.KeysOnlyProjection()

    gsi_pk = pm.UnicodeAttribute(hash_key=True)
    gsi_sk = pm.UnicodeAttribute(range_key=True)


class Model4(pm.Model):
    class Meta:
        table_name = "sparce_indexes_test_4"
        region = "us-east-1"
        billing_mode = pm.PAY_PER_REQUEST_BILLING_MODE

    pk = pm.UnicodeAttribute(hash_key=True)
    gsi_pk = pm.UnicodeAttribute(null=True)
    gsi_sk = pm.UnicodeAttribute(null=True)

    index = Model4Index()

Model4.create_table(wait=True)
[14]:
Model4(pk="id-1", gsi_pk="gsi-pk", gsi_sk="gsi-sk").save()
Model4(pk="id-2", gsi_pk="gsi-pk", gsi_sk="gsi-sk").save()
[14]:
{'ConsumedCapacity': {'CapacityUnits': 1.0,
  'TableName': 'sparce_indexes_test_4'}}
[15]:
for i in Model4.index.query(hash_key="gsi-pk"):
    print(i.attribute_values)
{'gsi_pk': 'gsi-pk', 'gsi_sk': 'gsi-sk', 'pk': 'id-1'}
{'gsi_pk': 'gsi-pk', 'gsi_sk': 'gsi-sk', 'pk': 'id-2'}
[ ]: