From f4e7a80600f4bef88a52270dc73a0ae369f272c4 Mon Sep 17 00:00:00 2001 From: Roman Rizzi Date: Tue, 27 Jun 2023 11:44:34 -0300 Subject: [PATCH] DEV: Cache summarization strategy results. (#22230) Updates the interface for implementing summarization strategies and adds a cache layer to summarize topics once. The cache stores the final summary and each chunk used to build it, which will be useful when we have to extend or rebuild it. --- .../discourse/app/components/topic-summary.js | 5 +- app/controllers/topics_controller.rb | 13 +- app/models/post.rb | 2 +- app/models/summary_section.rb | 21 +++ app/services/topic_summarization.rb | 90 ++++++++++++ ...608163854_create_summary_sections_table.rb | 16 +++ lib/summarization/base.rb | 69 +++++++--- .../chat/api/summaries_controller.rb | 2 +- .../spec/system/chat_summarization_spec.rb | 6 +- spec/lib/summarization/base_spec.rb | 4 +- spec/services/topic_summarization_spec.rb | 129 ++++++++++++++++++ spec/support/dummy_custom_summarization.rb | 10 +- spec/system/topic_summarization_spec.rb | 6 +- 13 files changed, 332 insertions(+), 41 deletions(-) create mode 100644 app/models/summary_section.rb create mode 100644 app/services/topic_summarization.rb create mode 100644 db/migrate/20230608163854_create_summary_sections_table.rb create mode 100644 spec/services/topic_summarization_spec.rb diff --git a/app/assets/javascripts/discourse/app/components/topic-summary.js b/app/assets/javascripts/discourse/app/components/topic-summary.js index 902344fc5bd..c86b03777ec 100644 --- a/app/assets/javascripts/discourse/app/components/topic-summary.js +++ b/app/assets/javascripts/discourse/app/components/topic-summary.js @@ -4,6 +4,7 @@ import { ajax } from "discourse/lib/ajax"; import { popupAjaxError } from "discourse/lib/ajax-error"; import { action } from "@ember/object"; import { schedule } from "@ember/runloop"; +import { cookAsync } from "discourse/lib/text"; export default class TopicSummary extends Component { @tracked loading = false; @@ -16,7 +17,9 @@ export default class TopicSummary extends Component { ajax(`/t/${this.args.topicId}/strategy-summary`) .then((data) => { - this.summary = data.summary; + cookAsync(data.summary).then((cooked) => { + this.summary = cooked; + }); }) .catch(popupAjaxError) .finally(() => (this.loading = false)); diff --git a/app/controllers/topics_controller.rb b/app/controllers/topics_controller.rb index eb4ff526652..3bd7beb72c6 100644 --- a/app/controllers/topics_controller.rb +++ b/app/controllers/topics_controller.rb @@ -1178,18 +1178,7 @@ class TopicsController < ApplicationController RateLimiter.new(current_user, "summary", 6, 5.minutes).performed! - hijack do - summary_opts = { - filter: "summary", - exclude_deleted_users: true, - exclude_hidden: true, - show_deleted: false, - } - - content = TopicView.new(topic, current_user, summary_opts).posts.pluck(:raw).join("\n") - - render json: { summary: strategy.summarize(content) } - end + hijack { render json: { summary: TopicSummarization.new(strategy).summarize(topic) } } end private diff --git a/app/models/post.rb b/app/models/post.rb index 014c72999eb..adccaf0d003 100644 --- a/app/models/post.rb +++ b/app/models/post.rb @@ -437,7 +437,7 @@ class Post < ActiveRecord::Base # percent rank has tons of ties where(topic_id: topic_id).where( [ - "id = ANY( + "posts.id = ANY( ( SELECT posts.id FROM posts diff --git a/app/models/summary_section.rb b/app/models/summary_section.rb new file mode 100644 index 00000000000..613c59b54e6 --- /dev/null +++ b/app/models/summary_section.rb @@ -0,0 +1,21 @@ +# frozen_string_literal: true + +class SummarySection < ActiveRecord::Base + belongs_to :target, polymorphic: true +end + +# == Schema Information +# +# Table name: summary_sections +# +# id :bigint not null, primary key +# target_id :integer not null +# target_type :string not null +# content_range :int4range +# summarized_text :string not null +# meta_section_id :integer +# original_content_sha :string not null +# algorithm :string not null +# created_at :datetime not null +# updated_at :datetime not null +# diff --git a/app/services/topic_summarization.rb b/app/services/topic_summarization.rb new file mode 100644 index 00000000000..b13452a739b --- /dev/null +++ b/app/services/topic_summarization.rb @@ -0,0 +1,90 @@ +# frozen_string_literal: true + +class TopicSummarization + def initialize(strategy) + @strategy = strategy + end + + def summarize(topic) + DistributedMutex.synchronize("toppic_summarization_#{topic.id}") do + existing_summary = SummarySection.find_by(target: topic, meta_section_id: nil) + return existing_summary.summarized_text if existing_summary + + content = { + resource_path: "#{Discourse.base_path}/t/-/#{topic.id}", + content_title: topic.title, + contents: [], + } + + targets_data = summary_targets(topic).pluck(:post_number, :raw, :username) + + targets_data.map do |(pn, raw, username)| + content[:contents] << { poster: username, id: pn, text: raw } + end + + summarization_result = strategy.summarize(content) + + cached_summary(summarization_result, targets_data.map(&:first), topic) + + summarization_result[:summary] + end + end + + def summary_targets(topic) + topic.has_summary? ? best_replies(topic) : pick_selection(topic) + end + + private + + attr_reader :strategy + + def best_replies(topic) + Post + .summary(topic.id) + .where("post_type = ?", Post.types[:regular]) + .where("NOT hidden") + .joins(:user) + .order(:post_number) + end + + def pick_selection(topic) + posts = + Post + .where(topic_id: topic.id) + .where("post_type = ?", Post.types[:regular]) + .where("NOT hidden") + .order(:post_number) + + post_numbers = posts.limit(5).pluck(:post_number) + post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number) + post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number) + + Post + .where(topic_id: topic.id) + .joins(:user) + .where("post_number in (?)", post_numbers) + .order(:post_number) + end + + def cached_summary(result, post_numbers, topic) + main_summary = + SummarySection.create!( + target: topic, + algorithm: strategy.model, + content_range: (post_numbers.first..post_numbers.last), + summarized_text: result[:summary], + original_content_sha: Digest::SHA256.hexdigest(post_numbers.join), + ) + + result[:chunks].each do |chunk| + SummarySection.create!( + target: topic, + algorithm: strategy.model, + content_range: chunk[:ids].min..chunk[:ids].max, + summarized_text: chunk[:summary], + original_content_sha: Digest::SHA256.hexdigest(chunk[:ids].join), + meta_section_id: main_summary.id, + ) + end + end +end diff --git a/db/migrate/20230608163854_create_summary_sections_table.rb b/db/migrate/20230608163854_create_summary_sections_table.rb new file mode 100644 index 00000000000..b325f5f14a1 --- /dev/null +++ b/db/migrate/20230608163854_create_summary_sections_table.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +class CreateSummarySectionsTable < ActiveRecord::Migration[7.0] + def change + create_table :summary_sections do |t| + t.integer :target_id, null: false + t.string :target_type, null: false + t.int4range :content_range + t.string :summarized_text, null: false + t.integer :meta_section_id + t.string :original_content_sha, null: false + t.string :algorithm, null: false + t.timestamps + end + end +end diff --git a/lib/summarization/base.rb b/lib/summarization/base.rb index d4886476bbc..e70ac1c2a57 100644 --- a/lib/summarization/base.rb +++ b/lib/summarization/base.rb @@ -1,27 +1,28 @@ # frozen_string_literal: true +# Base class that defines the interface that every summarization +# strategy must implement. +# Above each method, you'll find an explanation of what +# it does and what it should return. + module Summarization class Base - def self.available_strategies - DiscoursePluginRegistry.summarization_strategies + class << self + def available_strategies + DiscoursePluginRegistry.summarization_strategies + end + + def find_strategy(strategy_model) + available_strategies.detect { |s| s.model == strategy_model } + end + + def selected_strategy + return if SiteSetting.summarization_strategy.blank? + + find_strategy(SiteSetting.summarization_strategy) + end end - def self.find_strategy(strategy_model) - available_strategies.detect { |s| s.model == strategy_model } - end - - def self.selected_strategy - return if SiteSetting.summarization_strategy.blank? - - find_strategy(SiteSetting.summarization_strategy) - end - - def initialize(model) - @model = model - end - - attr_reader :model - def can_request_summaries?(user) user_group_ids = user.group_ids @@ -30,20 +31,52 @@ module Summarization end end + # Some strategies could require other conditions to work correctly, + # like site settings. + # This method gets called when admins attempt to select it, + # checking if we met those conditions. def correctly_configured? raise NotImplemented end + # Strategy name to display to admins in the available strategies dropdown. def display_name raise NotImplemented end + # If we don't meet the conditions to enable this strategy, + # we'll display this hint as an error to admins. def configuration_hint raise NotImplemented end + # The idea behind this method is "give me a collection of texts, + # and I'll handle the summarization to the best of my capabilities.". + # It's important to emphasize the "collection of texts" part, which implies + # it's not tied to any model and expects the "content" to be a hash instead. + # + # @param content { Hash } - Includes the content to summarize, plus additional + # context to help the strategy produce a better result. Keys present in the content hash: + # - resource_path (optional): Helps the strategy build links to the content in the summary (e.g. "/t/-/:topic_id/POST_NUMBER") + # - content_title (optional): Provides guidance about what the content is about. + # - contents (required): Array of hashes with content to summarize (e.g. [{ poster: "asd", id: 1, text: "This is a text" }]) + # All keys are required. + # + # @returns { Hash } - The summarized content, plus chunks if the content couldn't be summarized in one pass. Example: + # { + # summary: "This is the final summary", + # chunks: [ + # { ids: [topic.first_post.post_number], summary: "this is the first chunk" }, + # { ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" }, + # ], + # } def summarize(content) raise NotImplemented end + + # Returns the string we'll store in the selected strategy site setting. + def model + raise NotImplemented + end end end diff --git a/plugins/chat/app/controllers/chat/api/summaries_controller.rb b/plugins/chat/app/controllers/chat/api/summaries_controller.rb index 209aac46032..2070b6128bc 100644 --- a/plugins/chat/app/controllers/chat/api/summaries_controller.rb +++ b/plugins/chat/app/controllers/chat/api/summaries_controller.rb @@ -27,7 +27,7 @@ class Chat::Api::SummariesController < Chat::ApiController .map { "#{_1}: #{_2}" } .join("\n") - render json: { summary: strategy.summarize(content) } + render json: { summary: strategy.summarize(content).dig(:summary) } end end end diff --git a/plugins/chat/spec/system/chat_summarization_spec.rb b/plugins/chat/spec/system/chat_summarization_spec.rb index 38af780b02b..df504058e7d 100644 --- a/plugins/chat/spec/system/chat_summarization_spec.rb +++ b/plugins/chat/spec/system/chat_summarization_spec.rb @@ -11,10 +11,12 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t let(:chat) { PageObjects::Pages::Chat.new } + let(:summarization_result) { { summary: "This is a summary", chunks: [] } } + before do group.add(current_user) - strategy = DummyCustomSummarization.new("dummy") + strategy = DummyCustomSummarization.new(summarization_result) plugin.register_summarization_strategy(strategy) SiteSetting.summarization_strategy = strategy.model SiteSetting.custom_summarization_allowed_groups = group.id.to_s @@ -36,6 +38,6 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t find(".summarization-since").click find(".select-kit-row[data-value=\"3\"]").click - expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE) + expect(find(".summary-area").text).to eq(summarization_result[:summary]) end end diff --git a/spec/lib/summarization/base_spec.rb b/spec/lib/summarization/base_spec.rb index 6b211d6f95a..9e3809eb37e 100644 --- a/spec/lib/summarization/base_spec.rb +++ b/spec/lib/summarization/base_spec.rb @@ -10,13 +10,13 @@ describe Summarization::Base do it "returns true if the user group is present in the custom_summarization_allowed_groups_map setting" do SiteSetting.custom_summarization_allowed_groups = group.id - expect(described_class.new(nil).can_request_summaries?(user)).to eq(true) + expect(subject.can_request_summaries?(user)).to eq(true) end it "returns false if the user group is not present in the custom_summarization_allowed_groups_map setting" do SiteSetting.custom_summarization_allowed_groups = "" - expect(described_class.new(nil).can_request_summaries?(user)).to eq(false) + expect(subject.can_request_summaries?(user)).to eq(false) end end end diff --git a/spec/services/topic_summarization_spec.rb b/spec/services/topic_summarization_spec.rb new file mode 100644 index 00000000000..7e5f553a94c --- /dev/null +++ b/spec/services/topic_summarization_spec.rb @@ -0,0 +1,129 @@ +# frozen_string_literal: true + +describe TopicSummarization do + fab!(:topic) { Fabricate(:topic) } + fab!(:post_1) { Fabricate(:post, topic: topic) } + fab!(:post_2) { Fabricate(:post, topic: topic) } + + shared_examples "includes only public-visible topics" do + subject { described_class.new(DummyCustomSummarization.new({})) } + + it "only includes visible posts" do + topic.first_post.update!(hidden: true) + + posts = subject.summary_targets(topic) + + expect(posts.none?(&:hidden?)).to eq(true) + end + + it "doesn't include posts without users" do + topic.first_post.user.destroy! + + posts = subject.summary_targets(topic) + + expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil + end + + it "doesn't include deleted posts" do + topic.first_post.update!(user_id: nil) + + posts = subject.summary_targets(topic) + + expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil + end + end + + describe "#summary_targets" do + context "when the topic has a best replies summary" do + before { topic.has_summary = true } + + it_behaves_like "includes only public-visible topics" + end + + context "when the topic doesn't have a best replies summary" do + before { topic.has_summary = false } + + it_behaves_like "includes only public-visible topics" + end + end + + describe "#summarize" do + let(:strategy) { DummyCustomSummarization.new(summary) } + + subject { described_class.new(strategy) } + + def assert_summary_is_cached(topic, summary_response) + cached_summary = SummarySection.find_by(target: topic, meta_section_id: nil) + + expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number)) + expect(cached_summary.summarized_text).to eq(summary_response[:summary]) + expect(cached_summary.original_content_sha).to eq( + Digest::SHA256.hexdigest(topic.posts.map(&:post_number).join), + ) + expect(cached_summary.algorithm).to eq(strategy.model) + end + + def assert_chunk_is_cached(topic, chunk_response) + cached_chunk = + SummarySection + .where.not(meta_section_id: nil) + .find_by( + target: topic, + content_range: (chunk_response[:ids].min..chunk_response[:ids].max), + ) + + expect(cached_chunk.summarized_text).to eq(chunk_response[:summary]) + expect(cached_chunk.original_content_sha).to eq( + Digest::SHA256.hexdigest(chunk_response[:ids].join), + ) + expect(cached_chunk.algorithm).to eq(strategy.model) + end + + context "when the content was summarized in a single chunk" do + let(:summary) { { summary: "This is the final summary", chunks: [] } } + + it "caches the summary" do + summarized_text = subject.summarize(topic) + + expect(summarized_text).to eq(summary[:summary]) + + assert_summary_is_cached(topic, summary) + end + + it "returns the cached version in subsequent calls" do + subject.summarize(topic) + + cached_summary_text = "This is a cached summary" + cached_summary = + SummarySection.find_by(target: topic, meta_section_id: nil).update!( + summarized_text: cached_summary_text, + ) + + summarized_text = subject.summarize(topic) + expect(summarized_text).to eq(cached_summary_text) + end + end + + context "when the content was summarized in multiple chunks" do + let(:summary) do + { + summary: "This is the final summary", + chunks: [ + { ids: [topic.first_post.post_number], summary: "this is the first chunk" }, + { ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" }, + ], + } + end + + it "caches the summary and each chunk" do + summarized_text = subject.summarize(topic) + + expect(summarized_text).to eq(summary[:summary]) + + assert_summary_is_cached(topic, summary) + + summary[:chunks].each { |c| assert_chunk_is_cached(topic, c) } + end + end + end +end diff --git a/spec/support/dummy_custom_summarization.rb b/spec/support/dummy_custom_summarization.rb index 05d4593113d..1724a0274f6 100644 --- a/spec/support/dummy_custom_summarization.rb +++ b/spec/support/dummy_custom_summarization.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true class DummyCustomSummarization < Summarization::Base - RESPONSE = "This is a summary of the content you gave me" + def initialize(summarization_result) + @summarization_result = summarization_result + end def display_name "dummy" @@ -15,7 +17,11 @@ class DummyCustomSummarization < Summarization::Base "hint" end + def model + "dummy" + end + def summarize(_content) - RESPONSE + @summarization_result end end diff --git a/spec/system/topic_summarization_spec.rb b/spec/system/topic_summarization_spec.rb index d54bc8444d6..e3fd5c54b41 100644 --- a/spec/system/topic_summarization_spec.rb +++ b/spec/system/topic_summarization_spec.rb @@ -10,9 +10,11 @@ RSpec.describe "Topic summarization", type: :system, js: true do let(:plugin) { Plugin::Instance.new } + let(:summarization_result) { { summary: "This is a summary", chunks: [] } } + before do sign_in(user) - strategy = DummyCustomSummarization.new("dummy") + strategy = DummyCustomSummarization.new(summarization_result) plugin.register_summarization_strategy(strategy) SiteSetting.summarization_strategy = strategy.model end @@ -24,6 +26,6 @@ RSpec.describe "Topic summarization", type: :system, js: true do expect(page.has_css?(".topic-summary-modal", wait: 5)).to eq(true) - expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE) + expect(find(".summary-area").text).to eq(summarization_result[:summary]) end end