DEV: Cache summarization strategy results. (#22230)

Updates the interface for implementing summarization strategies and adds a cache layer to summarize topics once.

The cache stores the final summary and each chunk used to build it, which will be useful when we have to extend or rebuild it.
This commit is contained in:
Roman Rizzi 2023-06-27 11:44:34 -03:00 committed by GitHub
parent a909dffe8f
commit f4e7a80600
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
13 changed files with 332 additions and 41 deletions

View File

@ -4,6 +4,7 @@ import { ajax } from "discourse/lib/ajax";
import { popupAjaxError } from "discourse/lib/ajax-error"; import { popupAjaxError } from "discourse/lib/ajax-error";
import { action } from "@ember/object"; import { action } from "@ember/object";
import { schedule } from "@ember/runloop"; import { schedule } from "@ember/runloop";
import { cookAsync } from "discourse/lib/text";
export default class TopicSummary extends Component { export default class TopicSummary extends Component {
@tracked loading = false; @tracked loading = false;
@ -16,7 +17,9 @@ export default class TopicSummary extends Component {
ajax(`/t/${this.args.topicId}/strategy-summary`) ajax(`/t/${this.args.topicId}/strategy-summary`)
.then((data) => { .then((data) => {
this.summary = data.summary; cookAsync(data.summary).then((cooked) => {
this.summary = cooked;
});
}) })
.catch(popupAjaxError) .catch(popupAjaxError)
.finally(() => (this.loading = false)); .finally(() => (this.loading = false));

View File

@ -1178,18 +1178,7 @@ class TopicsController < ApplicationController
RateLimiter.new(current_user, "summary", 6, 5.minutes).performed! RateLimiter.new(current_user, "summary", 6, 5.minutes).performed!
hijack do hijack { render json: { summary: TopicSummarization.new(strategy).summarize(topic) } }
summary_opts = {
filter: "summary",
exclude_deleted_users: true,
exclude_hidden: true,
show_deleted: false,
}
content = TopicView.new(topic, current_user, summary_opts).posts.pluck(:raw).join("\n")
render json: { summary: strategy.summarize(content) }
end
end end
private private

View File

@ -437,7 +437,7 @@ class Post < ActiveRecord::Base
# percent rank has tons of ties # percent rank has tons of ties
where(topic_id: topic_id).where( where(topic_id: topic_id).where(
[ [
"id = ANY( "posts.id = ANY(
( (
SELECT posts.id SELECT posts.id
FROM posts FROM posts

View File

@ -0,0 +1,21 @@
# frozen_string_literal: true
class SummarySection < ActiveRecord::Base
belongs_to :target, polymorphic: true
end
# == Schema Information
#
# Table name: summary_sections
#
# id :bigint not null, primary key
# target_id :integer not null
# target_type :string not null
# content_range :int4range
# summarized_text :string not null
# meta_section_id :integer
# original_content_sha :string not null
# algorithm :string not null
# created_at :datetime not null
# updated_at :datetime not null
#

View File

@ -0,0 +1,90 @@
# frozen_string_literal: true
class TopicSummarization
def initialize(strategy)
@strategy = strategy
end
def summarize(topic)
DistributedMutex.synchronize("toppic_summarization_#{topic.id}") do
existing_summary = SummarySection.find_by(target: topic, meta_section_id: nil)
return existing_summary.summarized_text if existing_summary
content = {
resource_path: "#{Discourse.base_path}/t/-/#{topic.id}",
content_title: topic.title,
contents: [],
}
targets_data = summary_targets(topic).pluck(:post_number, :raw, :username)
targets_data.map do |(pn, raw, username)|
content[:contents] << { poster: username, id: pn, text: raw }
end
summarization_result = strategy.summarize(content)
cached_summary(summarization_result, targets_data.map(&:first), topic)
summarization_result[:summary]
end
end
def summary_targets(topic)
topic.has_summary? ? best_replies(topic) : pick_selection(topic)
end
private
attr_reader :strategy
def best_replies(topic)
Post
.summary(topic.id)
.where("post_type = ?", Post.types[:regular])
.where("NOT hidden")
.joins(:user)
.order(:post_number)
end
def pick_selection(topic)
posts =
Post
.where(topic_id: topic.id)
.where("post_type = ?", Post.types[:regular])
.where("NOT hidden")
.order(:post_number)
post_numbers = posts.limit(5).pluck(:post_number)
post_numbers += posts.reorder("posts.score desc").limit(50).pluck(:post_number)
post_numbers += posts.reorder("post_number desc").limit(5).pluck(:post_number)
Post
.where(topic_id: topic.id)
.joins(:user)
.where("post_number in (?)", post_numbers)
.order(:post_number)
end
def cached_summary(result, post_numbers, topic)
main_summary =
SummarySection.create!(
target: topic,
algorithm: strategy.model,
content_range: (post_numbers.first..post_numbers.last),
summarized_text: result[:summary],
original_content_sha: Digest::SHA256.hexdigest(post_numbers.join),
)
result[:chunks].each do |chunk|
SummarySection.create!(
target: topic,
algorithm: strategy.model,
content_range: chunk[:ids].min..chunk[:ids].max,
summarized_text: chunk[:summary],
original_content_sha: Digest::SHA256.hexdigest(chunk[:ids].join),
meta_section_id: main_summary.id,
)
end
end
end

View File

@ -0,0 +1,16 @@
# frozen_string_literal: true
class CreateSummarySectionsTable < ActiveRecord::Migration[7.0]
def change
create_table :summary_sections do |t|
t.integer :target_id, null: false
t.string :target_type, null: false
t.int4range :content_range
t.string :summarized_text, null: false
t.integer :meta_section_id
t.string :original_content_sha, null: false
t.string :algorithm, null: false
t.timestamps
end
end
end

View File

@ -1,27 +1,28 @@
# frozen_string_literal: true # frozen_string_literal: true
# Base class that defines the interface that every summarization
# strategy must implement.
# Above each method, you'll find an explanation of what
# it does and what it should return.
module Summarization module Summarization
class Base class Base
def self.available_strategies class << self
def available_strategies
DiscoursePluginRegistry.summarization_strategies DiscoursePluginRegistry.summarization_strategies
end end
def self.find_strategy(strategy_model) def find_strategy(strategy_model)
available_strategies.detect { |s| s.model == strategy_model } available_strategies.detect { |s| s.model == strategy_model }
end end
def self.selected_strategy def selected_strategy
return if SiteSetting.summarization_strategy.blank? return if SiteSetting.summarization_strategy.blank?
find_strategy(SiteSetting.summarization_strategy) find_strategy(SiteSetting.summarization_strategy)
end end
def initialize(model)
@model = model
end end
attr_reader :model
def can_request_summaries?(user) def can_request_summaries?(user)
user_group_ids = user.group_ids user_group_ids = user.group_ids
@ -30,20 +31,52 @@ module Summarization
end end
end end
# Some strategies could require other conditions to work correctly,
# like site settings.
# This method gets called when admins attempt to select it,
# checking if we met those conditions.
def correctly_configured? def correctly_configured?
raise NotImplemented raise NotImplemented
end end
# Strategy name to display to admins in the available strategies dropdown.
def display_name def display_name
raise NotImplemented raise NotImplemented
end end
# If we don't meet the conditions to enable this strategy,
# we'll display this hint as an error to admins.
def configuration_hint def configuration_hint
raise NotImplemented raise NotImplemented
end end
# The idea behind this method is "give me a collection of texts,
# and I'll handle the summarization to the best of my capabilities.".
# It's important to emphasize the "collection of texts" part, which implies
# it's not tied to any model and expects the "content" to be a hash instead.
#
# @param content { Hash } - Includes the content to summarize, plus additional
# context to help the strategy produce a better result. Keys present in the content hash:
# - resource_path (optional): Helps the strategy build links to the content in the summary (e.g. "/t/-/:topic_id/POST_NUMBER")
# - content_title (optional): Provides guidance about what the content is about.
# - contents (required): Array of hashes with content to summarize (e.g. [{ poster: "asd", id: 1, text: "This is a text" }])
# All keys are required.
#
# @returns { Hash } - The summarized content, plus chunks if the content couldn't be summarized in one pass. Example:
# {
# summary: "This is the final summary",
# chunks: [
# { ids: [topic.first_post.post_number], summary: "this is the first chunk" },
# { ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" },
# ],
# }
def summarize(content) def summarize(content)
raise NotImplemented raise NotImplemented
end end
# Returns the string we'll store in the selected strategy site setting.
def model
raise NotImplemented
end
end end
end end

View File

@ -27,7 +27,7 @@ class Chat::Api::SummariesController < Chat::ApiController
.map { "#{_1}: #{_2}" } .map { "#{_1}: #{_2}" }
.join("\n") .join("\n")
render json: { summary: strategy.summarize(content) } render json: { summary: strategy.summarize(content).dig(:summary) }
end end
end end
end end

View File

@ -11,10 +11,12 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t
let(:chat) { PageObjects::Pages::Chat.new } let(:chat) { PageObjects::Pages::Chat.new }
let(:summarization_result) { { summary: "This is a summary", chunks: [] } }
before do before do
group.add(current_user) group.add(current_user)
strategy = DummyCustomSummarization.new("dummy") strategy = DummyCustomSummarization.new(summarization_result)
plugin.register_summarization_strategy(strategy) plugin.register_summarization_strategy(strategy)
SiteSetting.summarization_strategy = strategy.model SiteSetting.summarization_strategy = strategy.model
SiteSetting.custom_summarization_allowed_groups = group.id.to_s SiteSetting.custom_summarization_allowed_groups = group.id.to_s
@ -36,6 +38,6 @@ RSpec.describe "Summarize a channel since your last visit", type: :system, js: t
find(".summarization-since").click find(".summarization-since").click
find(".select-kit-row[data-value=\"3\"]").click find(".select-kit-row[data-value=\"3\"]").click
expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE) expect(find(".summary-area").text).to eq(summarization_result[:summary])
end end
end end

View File

@ -10,13 +10,13 @@ describe Summarization::Base do
it "returns true if the user group is present in the custom_summarization_allowed_groups_map setting" do it "returns true if the user group is present in the custom_summarization_allowed_groups_map setting" do
SiteSetting.custom_summarization_allowed_groups = group.id SiteSetting.custom_summarization_allowed_groups = group.id
expect(described_class.new(nil).can_request_summaries?(user)).to eq(true) expect(subject.can_request_summaries?(user)).to eq(true)
end end
it "returns false if the user group is not present in the custom_summarization_allowed_groups_map setting" do it "returns false if the user group is not present in the custom_summarization_allowed_groups_map setting" do
SiteSetting.custom_summarization_allowed_groups = "" SiteSetting.custom_summarization_allowed_groups = ""
expect(described_class.new(nil).can_request_summaries?(user)).to eq(false) expect(subject.can_request_summaries?(user)).to eq(false)
end end
end end
end end

View File

@ -0,0 +1,129 @@
# frozen_string_literal: true
describe TopicSummarization do
fab!(:topic) { Fabricate(:topic) }
fab!(:post_1) { Fabricate(:post, topic: topic) }
fab!(:post_2) { Fabricate(:post, topic: topic) }
shared_examples "includes only public-visible topics" do
subject { described_class.new(DummyCustomSummarization.new({})) }
it "only includes visible posts" do
topic.first_post.update!(hidden: true)
posts = subject.summary_targets(topic)
expect(posts.none?(&:hidden?)).to eq(true)
end
it "doesn't include posts without users" do
topic.first_post.user.destroy!
posts = subject.summary_targets(topic)
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end
it "doesn't include deleted posts" do
topic.first_post.update!(user_id: nil)
posts = subject.summary_targets(topic)
expect(posts.detect { |p| p.id == topic.first_post.id }).to be_nil
end
end
describe "#summary_targets" do
context "when the topic has a best replies summary" do
before { topic.has_summary = true }
it_behaves_like "includes only public-visible topics"
end
context "when the topic doesn't have a best replies summary" do
before { topic.has_summary = false }
it_behaves_like "includes only public-visible topics"
end
end
describe "#summarize" do
let(:strategy) { DummyCustomSummarization.new(summary) }
subject { described_class.new(strategy) }
def assert_summary_is_cached(topic, summary_response)
cached_summary = SummarySection.find_by(target: topic, meta_section_id: nil)
expect(cached_summary.content_range).to cover(*topic.posts.map(&:post_number))
expect(cached_summary.summarized_text).to eq(summary_response[:summary])
expect(cached_summary.original_content_sha).to eq(
Digest::SHA256.hexdigest(topic.posts.map(&:post_number).join),
)
expect(cached_summary.algorithm).to eq(strategy.model)
end
def assert_chunk_is_cached(topic, chunk_response)
cached_chunk =
SummarySection
.where.not(meta_section_id: nil)
.find_by(
target: topic,
content_range: (chunk_response[:ids].min..chunk_response[:ids].max),
)
expect(cached_chunk.summarized_text).to eq(chunk_response[:summary])
expect(cached_chunk.original_content_sha).to eq(
Digest::SHA256.hexdigest(chunk_response[:ids].join),
)
expect(cached_chunk.algorithm).to eq(strategy.model)
end
context "when the content was summarized in a single chunk" do
let(:summary) { { summary: "This is the final summary", chunks: [] } }
it "caches the summary" do
summarized_text = subject.summarize(topic)
expect(summarized_text).to eq(summary[:summary])
assert_summary_is_cached(topic, summary)
end
it "returns the cached version in subsequent calls" do
subject.summarize(topic)
cached_summary_text = "This is a cached summary"
cached_summary =
SummarySection.find_by(target: topic, meta_section_id: nil).update!(
summarized_text: cached_summary_text,
)
summarized_text = subject.summarize(topic)
expect(summarized_text).to eq(cached_summary_text)
end
end
context "when the content was summarized in multiple chunks" do
let(:summary) do
{
summary: "This is the final summary",
chunks: [
{ ids: [topic.first_post.post_number], summary: "this is the first chunk" },
{ ids: [post_1.post_number, post_2.post_number], summary: "this is the second chunk" },
],
}
end
it "caches the summary and each chunk" do
summarized_text = subject.summarize(topic)
expect(summarized_text).to eq(summary[:summary])
assert_summary_is_cached(topic, summary)
summary[:chunks].each { |c| assert_chunk_is_cached(topic, c) }
end
end
end
end

View File

@ -1,7 +1,9 @@
# frozen_string_literal: true # frozen_string_literal: true
class DummyCustomSummarization < Summarization::Base class DummyCustomSummarization < Summarization::Base
RESPONSE = "This is a summary of the content you gave me" def initialize(summarization_result)
@summarization_result = summarization_result
end
def display_name def display_name
"dummy" "dummy"
@ -15,7 +17,11 @@ class DummyCustomSummarization < Summarization::Base
"hint" "hint"
end end
def model
"dummy"
end
def summarize(_content) def summarize(_content)
RESPONSE @summarization_result
end end
end end

View File

@ -10,9 +10,11 @@ RSpec.describe "Topic summarization", type: :system, js: true do
let(:plugin) { Plugin::Instance.new } let(:plugin) { Plugin::Instance.new }
let(:summarization_result) { { summary: "This is a summary", chunks: [] } }
before do before do
sign_in(user) sign_in(user)
strategy = DummyCustomSummarization.new("dummy") strategy = DummyCustomSummarization.new(summarization_result)
plugin.register_summarization_strategy(strategy) plugin.register_summarization_strategy(strategy)
SiteSetting.summarization_strategy = strategy.model SiteSetting.summarization_strategy = strategy.model
end end
@ -24,6 +26,6 @@ RSpec.describe "Topic summarization", type: :system, js: true do
expect(page.has_css?(".topic-summary-modal", wait: 5)).to eq(true) expect(page.has_css?(".topic-summary-modal", wait: 5)).to eq(true)
expect(find(".summary-area").text).to eq(DummyCustomSummarization::RESPONSE) expect(find(".summary-area").text).to eq(summarization_result[:summary])
end end
end end