Chunked search reindexing

2021-02-14 12:45:11 +01:00 · 2021-02-14 12:45:11 +01:00 · a4d16c2710
parent 37b1ba69ab
commit a4d16c2710
2 changed files with 39 additions and 10 deletions
--- a/search/management/commands/rebuild_search_index.py
+++ b/search/management/commands/rebuild_search_index.py
@@ -6,6 +6,7 @@ from comments.models import Comment
 from posts.models.post import Post
 from search.models import SearchIndex
 from users.models.user import User
+from utils.queryset import chunked_queryset

 log = logging.getLogger(__name__)

@@ -16,17 +17,26 @@ class Command(BaseCommand):
    def handle(self, *args, **options):
+        """Delete every SearchIndex row, then re-index comments, posts and users chunk by chunk."""
        SearchIndex.objects.all().delete()

-        for comment in Comment.visible_objects().filter(is_deleted=False, post__is_visible=True):
-            self.stdout.write(f"Indexing comment: {comment.id}")
-            SearchIndex.update_comment_index(comment)
+        # No order_by() on the querysets below: chunked_queryset() replaces any ordering
+        # with order_by("pk"), so rows are always indexed in primary-key order.
+        comments = Comment.visible_objects().filter(is_deleted=False, post__is_visible=True)
+        for chunk in chunked_queryset(comments):
+            for comment in chunk:
+                self.stdout.write(f"Indexing comment: {comment.id}")
+                SearchIndex.update_comment_index(comment)

-        for post in Post.visible_objects().filter(is_shadow_banned=False):
-            self.stdout.write(f"Indexing post: {post.slug}")
-            SearchIndex.update_post_index(post)
+        posts = Post.visible_objects().filter(is_shadow_banned=False)
+        for chunk in chunked_queryset(posts):
+            for post in chunk:
+                self.stdout.write(f"Indexing post: {post.slug}")
+                SearchIndex.update_post_index(post)

-        for user in User.objects.filter(moderation_status=User.MODERATION_STATUS_APPROVED):
-            self.stdout.write(f"Indexing user: {user.slug}")
-            SearchIndex.update_user_index(user)
-            SearchIndex.update_user_tags(user)
+        approved_users = User.objects.filter(moderation_status=User.MODERATION_STATUS_APPROVED)
+        for chunk in chunked_queryset(approved_users):
+            for user in chunk:
+                self.stdout.write(f"Indexing user: {user.slug}")
+                SearchIndex.update_user_index(user)
+                SearchIndex.update_user_tags(user)

        self.stdout.write("Done 🥙")
--- a/utils/queryset.py
+++ b/utils/queryset.py
@@ -0,0 +1,19 @@
+def chunked_queryset(queryset, chunk_size=1000):
+    """Yield sub-querysets of at most `chunk_size` rows each, walking `queryset` in pk order.
+
+    Any ordering already set on `queryset` is replaced by `order_by("pk")`. The first chunk has
+    no lower pk bound, so the walk does not depend on pks being positive integers.
+    """
+    if chunk_size < 1:
+        raise ValueError("chunk_size must be a positive integer")
+
+    queryset = queryset.order_by("pk")
+    remaining = queryset
+    while True:
+        # one query answers both "is anything left?" and "which pk closes this chunk?"
+        chunk_pks = list(remaining.values_list("pk", flat=True)[:chunk_size])
+        if not chunk_pks:
+            break
+
+        yield remaining.filter(pk__lte=chunk_pks[-1])
+        remaining = queryset.filter(pk__gt=chunk_pks[-1])