[MM-55052] Improve file extraction logging (#25036)

Ben Schumacher · 2023-10-23 20:25:40 +02:00 · committed by GitHub
parent aa82597e2d
commit e864f9cfee
15 changed files with 74 additions and 26 deletions

View File

@@ -566,7 +566,7 @@ type AppIface interface {
ExportFileExists(path string) (bool, *model.AppError)
ExportFileModTime(path string) (time.Time, *model.AppError)
ExportPermissions(w io.Writer) error
-ExtractContentFromFileInfo(fileInfo *model.FileInfo) error
+ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error
FetchSamlMetadataFromIdp(url string) ([]byte, *model.AppError)
FileBackend() filestore.FileBackend
FileExists(path string) (bool, *model.AppError)

View File

@@ -757,7 +757,7 @@ func (a *App) UploadFileX(c *request.Context, channelID, name string, input io.R
if written > t.maxFileSize {
if fileErr := a.RemoveFile(t.fileinfo.Path); fileErr != nil {
-mlog.Error("Failed to remove file", mlog.Err(fileErr))
+c.Logger().Error("Failed to remove file", mlog.Err(fileErr))
}
return nil, t.newAppError("api.file.upload_file.too_large_detailed.app_error", http.StatusRequestEntityTooLarge, "Length", t.ContentLength, "Limit", t.maxFileSize)
}
@@ -796,10 +796,11 @@ func (a *App) UploadFileX(c *request.Context, channelID, name string, input io.R
if *a.Config().FileSettings.ExtractContent {
infoCopy := *t.fileinfo
+crctx := c.Clone()
a.Srv().GoBuffered(func() {
-err := a.ExtractContentFromFileInfo(&infoCopy)
+err := a.ExtractContentFromFileInfo(crctx, &infoCopy)
if err != nil {
-mlog.Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
+crctx.Logger().Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
}
})
}
@@ -1047,10 +1048,11 @@ func (a *App) DoUploadFileExpectModification(c request.CTX, now time.Time, rawTe
if *a.Config().FileSettings.ExtractContent {
infoCopy := *info
+crctx := c.Clone()
a.Srv().GoBuffered(func() {
-err := a.ExtractContentFromFileInfo(&infoCopy)
+err := a.ExtractContentFromFileInfo(crctx, &infoCopy)
if err != nil {
-mlog.Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
+crctx.Logger().Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
}
})
}
@@ -1442,7 +1444,7 @@ func (a *App) SearchFilesInTeamForUser(c *request.Context, terms string, userId
return fileInfoSearchResults, a.filterInaccessibleFiles(fileInfoSearchResults, filterFileOptions{assumeSortedCreatedAt: true})
}
-func (a *App) ExtractContentFromFileInfo(fileInfo *model.FileInfo) error {
+func (a *App) ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error {
// We don't process images.
if fileInfo.IsImage() {
return nil
@@ -1453,7 +1455,7 @@ func (a *App) ExtractContentFromFileInfo(fileInfo *model.FileInfo) error {
return errors.Wrap(aerr, "failed to open file for extract file content")
}
defer file.Close()
-text, err := docextractor.Extract(fileInfo.Name, file, docextractor.ExtractSettings{
+text, err := docextractor.Extract(rctx.Logger(), fileInfo.Name, file, docextractor.ExtractSettings{
ArchiveRecursion: *a.Config().FileSettings.ArchiveRecursion,
})
if err != nil {

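Both upload paths in this file follow the same pattern: the request context is cloned before the extraction work is handed to a buffered goroutine, so the background task gets a logger that still carries the request's fields but no longer depends on the request's lifetime, and the package-level mlog calls are replaced by that request-scoped logger. A minimal sketch of the pattern as a hypothetical helper (asyncExtractContent does not exist in this commit; the real code inlines the body in UploadFileX and DoUploadFileExpectModification):

// Sketch only. Uses the same calls as the hunks above; assumes the imports
// already present in this file (model, mlog, request).
func (a *App) asyncExtractContent(c request.CTX, info *model.FileInfo) {
	if !*a.Config().FileSettings.ExtractContent {
		return
	}
	infoCopy := *info  // copy, so the goroutine does not share the caller's struct
	crctx := c.Clone() // detached context that keeps the request's log fields
	a.Srv().GoBuffered(func() {
		if err := a.ExtractContentFromFileInfo(crctx, &infoCopy); err != nil {
			crctx.Logger().Error("Failed to extract file content",
				mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
		}
	})
}
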
View File

@@ -17,6 +17,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/mattermost/mattermost/server/public/model"
+"github.com/mattermost/mattermost/server/public/shared/request"
"github.com/mattermost/mattermost/server/v8/channels/store"
storemocks "github.com/mattermost/mattermost/server/v8/channels/store/storetest/mocks"
"github.com/mattermost/mattermost/server/v8/channels/utils/fileutils"
@@ -555,7 +556,7 @@ func TestExtractContentFromFileInfo(t *testing.T) {
}
// Test that we don't process images.
-require.NoError(t, app.ExtractContentFromFileInfo(fi))
+require.NoError(t, app.ExtractContentFromFileInfo(request.TestContext(t), fi))
}
func TestGetLastAccessibleFileTime(t *testing.T) {

View File

@@ -4248,7 +4248,7 @@ func (a *OpenTracingAppLayer) ExtendSessionExpiryIfNeeded(session *model.Session
return resultVar0
}
-func (a *OpenTracingAppLayer) ExtractContentFromFileInfo(fileInfo *model.FileInfo) error {
+func (a *OpenTracingAppLayer) ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error {
origCtx := a.ctx
span, newCtx := tracing.StartSpanWithParentByContext(a.ctx, "app.ExtractContentFromFileInfo")
@@ -4260,7 +4260,7 @@ func (a *OpenTracingAppLayer) ExtractContentFromFileInfo(fileInfo *model.FileInf
}()
defer span.Finish()
-resultVar0 := a.app.ExtractContentFromFileInfo(fileInfo)
+resultVar0 := a.app.ExtractContentFromFileInfo(rctx, fileInfo)
if resultVar0 != nil {
span.LogFields(spanlog.Error(resultVar0))

View File

@@ -318,17 +318,18 @@ func (a *App) UploadData(c request.CTX, us *model.UploadSession, rd io.Reader) (
if *a.Config().FileSettings.ExtractContent {
infoCopy := *info
+crctx := c.Clone()
a.Srv().Go(func() {
-err := a.ExtractContentFromFileInfo(&infoCopy)
+err := a.ExtractContentFromFileInfo(crctx, &infoCopy)
if err != nil {
-mlog.Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
+crctx.Logger().Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
}
})
}
// delete upload session
if storeErr := a.Srv().Store().UploadSession().Delete(us.Id); storeErr != nil {
-mlog.Warn("Failed to delete UploadSession", mlog.Err(storeErr))
+c.Logger().Warn("Failed to delete UploadSession", mlog.Err(storeErr))
}
return info, nil

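Two flavors of the logger swap appear in this hunk. In the synchronous part of UploadData the request context is still live, so its logger is used directly; in the a.Srv().Go callback the request may already be finished, so the context is cloned first and the clone's logger is used inside the goroutine. A compressed contrast of the two, with the surrounding code elided:

// Synchronous path: the request context passed into UploadData is still valid.
if storeErr := a.Srv().Store().UploadSession().Delete(us.Id); storeErr != nil {
	c.Logger().Warn("Failed to delete UploadSession", mlog.Err(storeErr))
}

// Asynchronous path: clone first, then log through the clone inside the goroutine.
crctx := c.Clone()
a.Srv().Go(func() {
	if err := a.ExtractContentFromFileInfo(crctx, &infoCopy); err != nil {
		crctx.Logger().Error("Failed to extract file content",
			mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
	}
})
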
View File

@@ -8,6 +8,7 @@ import (
"github.com/mattermost/mattermost/server/public/model"
"github.com/mattermost/mattermost/server/public/shared/mlog"
+"github.com/mattermost/mattermost/server/public/shared/request"
"github.com/mattermost/mattermost/server/v8/channels/jobs"
"github.com/mattermost/mattermost/server/v8/channels/store"
)
@@ -20,7 +21,7 @@ var ignoredFiles = map[string]bool{
}
type AppIface interface {
-ExtractContentFromFileInfo(fileInfo *model.FileInfo) error
+ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error
}
func MakeWorker(jobServer *jobs.JobServer, app AppIface, store store.Store) *jobs.SimpleWorker {
@@ -66,7 +67,8 @@ func MakeWorker(jobServer *jobs.JobServer, app AppIface, store store.Store) *job
for _, fileInfo := range fileInfos {
if !ignoredFiles[fileInfo.Extension] {
logger.Debug("Extracting file", mlog.String("filename", fileInfo.Name), mlog.String("filepath", fileInfo.Path))
-err = app.ExtractContentFromFileInfo(fileInfo)
+err = app.ExtractContentFromFileInfo(request.EmptyContext(logger), fileInfo)
if err != nil {
logger.Warn("Failed to extract file content", mlog.Err(err), mlog.String("file_info_id", fileInfo.Id))
nErrs++

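The batch extraction job has no incoming request to borrow a context from, so it wraps the job's own logger in an empty request context via request.EmptyContext, as in the hunk above. A sketch of the worker loop with that change, hoisting the context construction out of the loop (the diff constructs it per iteration) and assuming fileInfos, ignoredFiles and nErrs as in the surrounding code:

rctx := request.EmptyContext(logger) // background context carrying the job's logger
for _, fileInfo := range fileInfos {
	if ignoredFiles[fileInfo.Extension] {
		continue
	}
	logger.Debug("Extracting file",
		mlog.String("filename", fileInfo.Name), mlog.String("filepath", fileInfo.Path))
	if err := app.ExtractContentFromFileInfo(rctx, fileInfo); err != nil {
		logger.Warn("Failed to extract file content",
			mlog.Err(err), mlog.String("file_info_id", fileInfo.Id))
		nErrs++
	}
}
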
View File

@@ -18,6 +18,10 @@ type archiveExtractor struct {
SubExtractor Extractor
}
+func (ae *archiveExtractor) Name() string {
+return "archiveExtractor"
+}
func (ae *archiveExtractor) Match(filename string) bool {
_, err := archiver.ByExtension(filename)
return err == nil

View File

@@ -10,9 +10,14 @@ import (
)
type combineExtractor struct {
+logger mlog.LoggerIFace
SubExtractors []Extractor
}
+func (ce *combineExtractor) Name() string {
+return "combineExtractor"
+}
func (ce *combineExtractor) Add(extractor Extractor) {
ce.SubExtractors = append(ce.SubExtractors, extractor)
}
@@ -32,7 +37,7 @@ func (ce *combineExtractor) Extract(filename string, r io.ReadSeeker) (string, e
r.Seek(0, io.SeekStart)
text, err := extractor.Extract(filename, r)
if err != nil {
-mlog.Warn("unable to extract file content", mlog.Err(err))
+ce.logger.Warn("Unable to extract file content", mlog.String("file_name", filename), mlog.String("extractor", extractor.Name()), mlog.Err(err))
continue
}
return text, nil

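combineExtractor tries its sub-extractors in order and falls through to the next one on failure; with the injected logger and the new Name() method, the warning now says which extractor failed on which file. A sketch of the full loop as it reads after this change (only the inner body appears in the hunk above; the Match check and the final return are assumptions filled in for the sketch):

func (ce *combineExtractor) Extract(filename string, r io.ReadSeeker) (string, error) {
	for _, extractor := range ce.SubExtractors {
		if !extractor.Match(filename) {
			continue
		}
		// Rewind before every attempt; an earlier extractor may have read the stream.
		r.Seek(0, io.SeekStart)
		text, err := extractor.Extract(filename, r)
		if err != nil {
			ce.logger.Warn("Unable to extract file content",
				mlog.String("file_name", filename),
				mlog.String("extractor", extractor.Name()),
				mlog.Err(err))
			continue
		}
		return text, nil
	}
	// Nothing matched or everything failed: return empty text without an error,
	// matching the unsupported-file behavior exercised by the tests below.
	return "", nil
}
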
View File

@@ -5,6 +5,8 @@ package docextractor
import (
"io"
+"github.com/mattermost/mattermost/server/public/shared/mlog"
)
// ExtractSettings defines the features enabled/disable during the document text extraction.
@@ -15,13 +17,15 @@ type ExtractSettings struct {
}
// Extract extract the text from a document using the system default extractors
-func Extract(filename string, r io.ReadSeeker, settings ExtractSettings) (string, error) {
-return ExtractWithExtraExtractors(filename, r, settings, []Extractor{})
+func Extract(logger mlog.LoggerIFace, filename string, r io.ReadSeeker, settings ExtractSettings) (string, error) {
+return ExtractWithExtraExtractors(logger, filename, r, settings, []Extractor{})
}
// ExtractWithExtraExtractors extract the text from a document using the provided extractors beside the system default extractors.
-func ExtractWithExtraExtractors(filename string, r io.ReadSeeker, settings ExtractSettings, extraExtractors []Extractor) (string, error) {
-enabledExtractors := &combineExtractor{}
+func ExtractWithExtraExtractors(logger mlog.LoggerIFace, filename string, r io.ReadSeeker, settings ExtractSettings, extraExtractors []Extractor) (string, error) {
+enabledExtractors := &combineExtractor{
+logger: logger,
+}
for _, extraExtractor := range extraExtractors {
enabledExtractors.Add(extraExtractor)
}

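For code outside the package, the visible change is the new leading logger argument on Extract and ExtractWithExtraExtractors. A minimal caller sketch (extractText is a hypothetical helper, and the docextractor import path is assumed from the sibling utils packages imported elsewhere in this commit):

import (
	"io"

	"github.com/mattermost/mattermost/server/public/shared/mlog"
	"github.com/mattermost/mattermost/server/v8/channels/utils/docextractor"
)

// extractText passes whatever mlog.LoggerIFace the caller already has
// (request logger, job logger, test logger) down into the extractors.
func extractText(logger mlog.LoggerIFace, name string, r io.ReadSeeker) (string, error) {
	return docextractor.Extract(logger, name, r, docextractor.ExtractSettings{
		ArchiveRecursion: false, // mirrors FileSettings.ArchiveRecursion in the server config
	})
}
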
View File

@@ -13,10 +13,13 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
+"github.com/mattermost/mattermost/server/public/shared/mlog"
"github.com/mattermost/mattermost/server/v8/channels/utils/testutils"
)
func TestExtract(t *testing.T) {
+logger := mlog.CreateConsoleTestLogger(t)
testCases := []struct {
Name string
TestFileName string
@@ -127,7 +130,7 @@ func TestExtract(t *testing.T) {
t.Run(tc.Name, func(t *testing.T) {
data, err := testutils.ReadTestFile(tc.TestFileName)
require.NoError(t, err)
-text, err := Extract(tc.TestFileName, bytes.NewReader(data), tc.Settings)
+text, err := Extract(logger, tc.TestFileName, bytes.NewReader(data), tc.Settings)
if tc.ExpectError {
require.Error(t, err)
} else {
@@ -145,7 +148,7 @@ func TestExtract(t *testing.T) {
t.Run("Unsupported binary file", func(t *testing.T) {
data, err := testutils.ReadTestFile("testjpg.jpg")
require.NoError(t, err)
-text, err := Extract("testjpg.jpg", bytes.NewReader(data), ExtractSettings{})
+text, err := Extract(logger, "testjpg.jpg", bytes.NewReader(data), ExtractSettings{})
require.NoError(t, err)
require.Equal(t, "", text)
})
@@ -153,7 +156,7 @@ func TestExtract(t *testing.T) {
t.Run("Wrong docx extension", func(t *testing.T) {
data, err := testutils.ReadTestFile("sample-doc.pdf")
require.NoError(t, err)
-text, err := Extract("sample-doc.docx", bytes.NewReader(data), ExtractSettings{})
+text, err := Extract(logger, "sample-doc.docx", bytes.NewReader(data), ExtractSettings{})
require.NoError(t, err)
require.Equal(t, "", text)
})
@@ -161,6 +164,10 @@ func TestExtract(t *testing.T) {
type customTestPdfExtractor struct{}
+func (te *customTestPdfExtractor) Name() string {
+return "customTestPdfExtractor"
+}
func (te *customTestPdfExtractor) Match(filename string) bool {
return strings.HasSuffix(filename, ".pdf")
}
@@ -171,6 +178,10 @@ func (te *customTestPdfExtractor) Extract(filename string, r io.ReadSeeker) (str
type failingExtractor struct{}
+func (te *failingExtractor) Name() string {
+return "failingExtractor"
+}
func (te *failingExtractor) Match(filename string) bool {
return true
}
@@ -180,11 +191,12 @@ func (te *failingExtractor) Extract(filename string, r io.ReadSeeker) (string, e
}
func TestExtractWithExtraExtractors(t *testing.T) {
+logger := mlog.CreateConsoleTestLogger(t)
t.Run("override existing extractor", func(t *testing.T) {
data, err := testutils.ReadTestFile("sample-doc.pdf")
require.NoError(t, err)
-text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&customTestPdfExtractor{}})
+text, err := ExtractWithExtraExtractors(logger, "sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&customTestPdfExtractor{}})
require.NoError(t, err)
require.Equal(t, text, "this is a text generated content")
})
@@ -193,7 +205,7 @@ func TestExtractWithExtraExtractors(t *testing.T) {
data, err := testutils.ReadTestFile("sample-doc.pdf")
require.NoError(t, err)
-text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&failingExtractor{}})
+text, err := ExtractWithExtraExtractors(logger, "sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&failingExtractor{}})
require.NoError(t, err)
assert.Contains(t, text, "simple")
assert.Contains(t, text, "document")

View File

@@ -26,6 +26,10 @@ var doconvConverterByExtensions = map[string]func(io.Reader) (string, map[string
"pdf": docconv.ConvertPDF,
}
+func (de *documentExtractor) Name() string {
+return "documentExtractor"
+}
func (de *documentExtractor) Match(filename string) bool {
extension := strings.TrimPrefix(path.Ext(filename), ".")
_, ok := doconvConverterByExtensions[extension]

View File

@@ -11,4 +11,5 @@ import (
type Extractor interface {
Match(filename string) bool
Extract(filename string, file io.ReadSeeker) (string, error)
+Name() string
}

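Every extractor now has to identify itself through Name(), which is what lets combineExtractor put the failing extractor's name into its warning. The test file above shows the shape of a custom implementation; here is a condensed sketch of a hypothetical extractor satisfying the extended interface (txtExtractor is illustrative only and assumes the "io" and "strings" imports):

// txtExtractor is a hypothetical example, not part of this commit.
type txtExtractor struct{}

func (te *txtExtractor) Name() string {
	return "txtExtractor"
}

func (te *txtExtractor) Match(filename string) bool {
	return strings.HasSuffix(filename, ".txt")
}

func (te *txtExtractor) Extract(filename string, r io.ReadSeeker) (string, error) {
	data, err := io.ReadAll(r)
	if err != nil {
		return "", err
	}
	return string(data), nil
}
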
View File

@@ -36,6 +36,10 @@ func newMMPreviewExtractor(url string, secret string, pdfExtractor pdfExtractor)
return &mmPreviewExtractor{url: url, secret: secret, pdfExtractor: pdfExtractor}
}
+func (mpe *mmPreviewExtractor) Name() string {
+return "mmPreviewExtractor"
+}
func (mpe *mmPreviewExtractor) Match(filename string) bool {
extension := strings.TrimPrefix(path.Ext(filename), ".")
return mmpreviewSupportedExtensions[extension]

View File

@@ -17,6 +17,10 @@ import (
type pdfExtractor struct{}
+func (pe *pdfExtractor) Name() string {
+return "pdfExtractor"
+}
func (pe *pdfExtractor) Match(filename string) bool {
supportedExtensions := map[string]bool{
"pdf": true,

View File

@@ -11,6 +11,10 @@ import (
type plainExtractor struct{}
+func (pe *plainExtractor) Name() string {
+return "plainExtractor"
+}
func (pe *plainExtractor) Match(filename string) bool {
return true
}