mirror of
https://github.com/mattermost/mattermost.git
synced 2025-02-25 18:55:24 -06:00
[MM-55052] Improve file extraction logging (#25036)
This commit is contained in:
parent
aa82597e2d
commit
e864f9cfee
@ -566,7 +566,7 @@ type AppIface interface {
|
||||
ExportFileExists(path string) (bool, *model.AppError)
|
||||
ExportFileModTime(path string) (time.Time, *model.AppError)
|
||||
ExportPermissions(w io.Writer) error
|
||||
ExtractContentFromFileInfo(fileInfo *model.FileInfo) error
|
||||
ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error
|
||||
FetchSamlMetadataFromIdp(url string) ([]byte, *model.AppError)
|
||||
FileBackend() filestore.FileBackend
|
||||
FileExists(path string) (bool, *model.AppError)
|
||||
|
@ -757,7 +757,7 @@ func (a *App) UploadFileX(c *request.Context, channelID, name string, input io.R
|
||||
|
||||
if written > t.maxFileSize {
|
||||
if fileErr := a.RemoveFile(t.fileinfo.Path); fileErr != nil {
|
||||
mlog.Error("Failed to remove file", mlog.Err(fileErr))
|
||||
c.Logger().Error("Failed to remove file", mlog.Err(fileErr))
|
||||
}
|
||||
return nil, t.newAppError("api.file.upload_file.too_large_detailed.app_error", http.StatusRequestEntityTooLarge, "Length", t.ContentLength, "Limit", t.maxFileSize)
|
||||
}
|
||||
@ -796,10 +796,11 @@ func (a *App) UploadFileX(c *request.Context, channelID, name string, input io.R
|
||||
|
||||
if *a.Config().FileSettings.ExtractContent {
|
||||
infoCopy := *t.fileinfo
|
||||
crctx := c.Clone()
|
||||
a.Srv().GoBuffered(func() {
|
||||
err := a.ExtractContentFromFileInfo(&infoCopy)
|
||||
err := a.ExtractContentFromFileInfo(crctx, &infoCopy)
|
||||
if err != nil {
|
||||
mlog.Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
|
||||
crctx.Logger().Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
|
||||
}
|
||||
})
|
||||
}
|
||||
@ -1047,10 +1048,11 @@ func (a *App) DoUploadFileExpectModification(c request.CTX, now time.Time, rawTe
|
||||
|
||||
if *a.Config().FileSettings.ExtractContent {
|
||||
infoCopy := *info
|
||||
crctx := c.Clone()
|
||||
a.Srv().GoBuffered(func() {
|
||||
err := a.ExtractContentFromFileInfo(&infoCopy)
|
||||
err := a.ExtractContentFromFileInfo(crctx, &infoCopy)
|
||||
if err != nil {
|
||||
mlog.Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
|
||||
crctx.Logger().Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
|
||||
}
|
||||
})
|
||||
}
|
||||
@ -1442,7 +1444,7 @@ func (a *App) SearchFilesInTeamForUser(c *request.Context, terms string, userId
|
||||
return fileInfoSearchResults, a.filterInaccessibleFiles(fileInfoSearchResults, filterFileOptions{assumeSortedCreatedAt: true})
|
||||
}
|
||||
|
||||
func (a *App) ExtractContentFromFileInfo(fileInfo *model.FileInfo) error {
|
||||
func (a *App) ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error {
|
||||
// We don't process images.
|
||||
if fileInfo.IsImage() {
|
||||
return nil
|
||||
@ -1453,7 +1455,7 @@ func (a *App) ExtractContentFromFileInfo(fileInfo *model.FileInfo) error {
|
||||
return errors.Wrap(aerr, "failed to open file for extract file content")
|
||||
}
|
||||
defer file.Close()
|
||||
text, err := docextractor.Extract(fileInfo.Name, file, docextractor.ExtractSettings{
|
||||
text, err := docextractor.Extract(rctx.Logger(), fileInfo.Name, file, docextractor.ExtractSettings{
|
||||
ArchiveRecursion: *a.Config().FileSettings.ArchiveRecursion,
|
||||
})
|
||||
if err != nil {
|
||||
|
@ -17,6 +17,7 @@ import (
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mattermost/mattermost/server/public/model"
|
||||
"github.com/mattermost/mattermost/server/public/shared/request"
|
||||
"github.com/mattermost/mattermost/server/v8/channels/store"
|
||||
storemocks "github.com/mattermost/mattermost/server/v8/channels/store/storetest/mocks"
|
||||
"github.com/mattermost/mattermost/server/v8/channels/utils/fileutils"
|
||||
@ -555,7 +556,7 @@ func TestExtractContentFromFileInfo(t *testing.T) {
|
||||
}
|
||||
|
||||
// Test that we don't process images.
|
||||
require.NoError(t, app.ExtractContentFromFileInfo(fi))
|
||||
require.NoError(t, app.ExtractContentFromFileInfo(request.TestContext(t), fi))
|
||||
}
|
||||
|
||||
func TestGetLastAccessibleFileTime(t *testing.T) {
|
||||
|
@ -4248,7 +4248,7 @@ func (a *OpenTracingAppLayer) ExtendSessionExpiryIfNeeded(session *model.Session
|
||||
return resultVar0
|
||||
}
|
||||
|
||||
func (a *OpenTracingAppLayer) ExtractContentFromFileInfo(fileInfo *model.FileInfo) error {
|
||||
func (a *OpenTracingAppLayer) ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error {
|
||||
origCtx := a.ctx
|
||||
span, newCtx := tracing.StartSpanWithParentByContext(a.ctx, "app.ExtractContentFromFileInfo")
|
||||
|
||||
@ -4260,7 +4260,7 @@ func (a *OpenTracingAppLayer) ExtractContentFromFileInfo(fileInfo *model.FileInf
|
||||
}()
|
||||
|
||||
defer span.Finish()
|
||||
resultVar0 := a.app.ExtractContentFromFileInfo(fileInfo)
|
||||
resultVar0 := a.app.ExtractContentFromFileInfo(rctx, fileInfo)
|
||||
|
||||
if resultVar0 != nil {
|
||||
span.LogFields(spanlog.Error(resultVar0))
|
||||
|
@ -318,17 +318,18 @@ func (a *App) UploadData(c request.CTX, us *model.UploadSession, rd io.Reader) (
|
||||
|
||||
if *a.Config().FileSettings.ExtractContent {
|
||||
infoCopy := *info
|
||||
crctx := c.Clone()
|
||||
a.Srv().Go(func() {
|
||||
err := a.ExtractContentFromFileInfo(&infoCopy)
|
||||
err := a.ExtractContentFromFileInfo(crctx, &infoCopy)
|
||||
if err != nil {
|
||||
mlog.Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
|
||||
crctx.Logger().Error("Failed to extract file content", mlog.Err(err), mlog.String("fileInfoId", infoCopy.Id))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// delete upload session
|
||||
if storeErr := a.Srv().Store().UploadSession().Delete(us.Id); storeErr != nil {
|
||||
mlog.Warn("Failed to delete UploadSession", mlog.Err(storeErr))
|
||||
c.Logger().Warn("Failed to delete UploadSession", mlog.Err(storeErr))
|
||||
}
|
||||
|
||||
return info, nil
|
||||
|
@ -8,6 +8,7 @@ import (
|
||||
|
||||
"github.com/mattermost/mattermost/server/public/model"
|
||||
"github.com/mattermost/mattermost/server/public/shared/mlog"
|
||||
"github.com/mattermost/mattermost/server/public/shared/request"
|
||||
"github.com/mattermost/mattermost/server/v8/channels/jobs"
|
||||
"github.com/mattermost/mattermost/server/v8/channels/store"
|
||||
)
|
||||
@ -20,7 +21,7 @@ var ignoredFiles = map[string]bool{
|
||||
}
|
||||
|
||||
type AppIface interface {
|
||||
ExtractContentFromFileInfo(fileInfo *model.FileInfo) error
|
||||
ExtractContentFromFileInfo(rctx request.CTX, fileInfo *model.FileInfo) error
|
||||
}
|
||||
|
||||
func MakeWorker(jobServer *jobs.JobServer, app AppIface, store store.Store) *jobs.SimpleWorker {
|
||||
@ -66,7 +67,8 @@ func MakeWorker(jobServer *jobs.JobServer, app AppIface, store store.Store) *job
|
||||
for _, fileInfo := range fileInfos {
|
||||
if !ignoredFiles[fileInfo.Extension] {
|
||||
logger.Debug("Extracting file", mlog.String("filename", fileInfo.Name), mlog.String("filepath", fileInfo.Path))
|
||||
err = app.ExtractContentFromFileInfo(fileInfo)
|
||||
|
||||
err = app.ExtractContentFromFileInfo(request.EmptyContext(logger), fileInfo)
|
||||
if err != nil {
|
||||
logger.Warn("Failed to extract file content", mlog.Err(err), mlog.String("file_info_id", fileInfo.Id))
|
||||
nErrs++
|
||||
|
@ -18,6 +18,10 @@ type archiveExtractor struct {
|
||||
SubExtractor Extractor
|
||||
}
|
||||
|
||||
func (ae *archiveExtractor) Name() string {
|
||||
return "archiveExtractor"
|
||||
}
|
||||
|
||||
func (ae *archiveExtractor) Match(filename string) bool {
|
||||
_, err := archiver.ByExtension(filename)
|
||||
return err == nil
|
||||
|
@ -10,9 +10,14 @@ import (
|
||||
)
|
||||
|
||||
type combineExtractor struct {
|
||||
logger mlog.LoggerIFace
|
||||
SubExtractors []Extractor
|
||||
}
|
||||
|
||||
func (ce *combineExtractor) Name() string {
|
||||
return "combineExtractor"
|
||||
}
|
||||
|
||||
func (ce *combineExtractor) Add(extractor Extractor) {
|
||||
ce.SubExtractors = append(ce.SubExtractors, extractor)
|
||||
}
|
||||
@ -32,7 +37,7 @@ func (ce *combineExtractor) Extract(filename string, r io.ReadSeeker) (string, e
|
||||
r.Seek(0, io.SeekStart)
|
||||
text, err := extractor.Extract(filename, r)
|
||||
if err != nil {
|
||||
mlog.Warn("unable to extract file content", mlog.Err(err))
|
||||
ce.logger.Warn("Unable to extract file content", mlog.String("file_name", filename), mlog.String("extractor", extractor.Name()), mlog.Err(err))
|
||||
continue
|
||||
}
|
||||
return text, nil
|
||||
|
@ -5,6 +5,8 @@ package docextractor
|
||||
|
||||
import (
|
||||
"io"
|
||||
|
||||
"github.com/mattermost/mattermost/server/public/shared/mlog"
|
||||
)
|
||||
|
||||
// ExtractSettings defines the features enabled/disable during the document text extraction.
|
||||
@ -15,13 +17,15 @@ type ExtractSettings struct {
|
||||
}
|
||||
|
||||
// Extract extract the text from a document using the system default extractors
|
||||
func Extract(filename string, r io.ReadSeeker, settings ExtractSettings) (string, error) {
|
||||
return ExtractWithExtraExtractors(filename, r, settings, []Extractor{})
|
||||
func Extract(logger mlog.LoggerIFace, filename string, r io.ReadSeeker, settings ExtractSettings) (string, error) {
|
||||
return ExtractWithExtraExtractors(logger, filename, r, settings, []Extractor{})
|
||||
}
|
||||
|
||||
// ExtractWithExtraExtractors extract the text from a document using the provided extractors beside the system default extractors.
|
||||
func ExtractWithExtraExtractors(filename string, r io.ReadSeeker, settings ExtractSettings, extraExtractors []Extractor) (string, error) {
|
||||
enabledExtractors := &combineExtractor{}
|
||||
func ExtractWithExtraExtractors(logger mlog.LoggerIFace, filename string, r io.ReadSeeker, settings ExtractSettings, extraExtractors []Extractor) (string, error) {
|
||||
enabledExtractors := &combineExtractor{
|
||||
logger: logger,
|
||||
}
|
||||
for _, extraExtractor := range extraExtractors {
|
||||
enabledExtractors.Add(extraExtractor)
|
||||
}
|
||||
|
@ -13,10 +13,13 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/mattermost/mattermost/server/public/shared/mlog"
|
||||
"github.com/mattermost/mattermost/server/v8/channels/utils/testutils"
|
||||
)
|
||||
|
||||
func TestExtract(t *testing.T) {
|
||||
logger := mlog.CreateConsoleTestLogger(t)
|
||||
|
||||
testCases := []struct {
|
||||
Name string
|
||||
TestFileName string
|
||||
@ -127,7 +130,7 @@ func TestExtract(t *testing.T) {
|
||||
t.Run(tc.Name, func(t *testing.T) {
|
||||
data, err := testutils.ReadTestFile(tc.TestFileName)
|
||||
require.NoError(t, err)
|
||||
text, err := Extract(tc.TestFileName, bytes.NewReader(data), tc.Settings)
|
||||
text, err := Extract(logger, tc.TestFileName, bytes.NewReader(data), tc.Settings)
|
||||
if tc.ExpectError {
|
||||
require.Error(t, err)
|
||||
} else {
|
||||
@ -145,7 +148,7 @@ func TestExtract(t *testing.T) {
|
||||
t.Run("Unsupported binary file", func(t *testing.T) {
|
||||
data, err := testutils.ReadTestFile("testjpg.jpg")
|
||||
require.NoError(t, err)
|
||||
text, err := Extract("testjpg.jpg", bytes.NewReader(data), ExtractSettings{})
|
||||
text, err := Extract(logger, "testjpg.jpg", bytes.NewReader(data), ExtractSettings{})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "", text)
|
||||
})
|
||||
@ -153,7 +156,7 @@ func TestExtract(t *testing.T) {
|
||||
t.Run("Wrong docx extension", func(t *testing.T) {
|
||||
data, err := testutils.ReadTestFile("sample-doc.pdf")
|
||||
require.NoError(t, err)
|
||||
text, err := Extract("sample-doc.docx", bytes.NewReader(data), ExtractSettings{})
|
||||
text, err := Extract(logger, "sample-doc.docx", bytes.NewReader(data), ExtractSettings{})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "", text)
|
||||
})
|
||||
@ -161,6 +164,10 @@ func TestExtract(t *testing.T) {
|
||||
|
||||
type customTestPdfExtractor struct{}
|
||||
|
||||
func (te *customTestPdfExtractor) Name() string {
|
||||
return "customTestPdfExtractor"
|
||||
}
|
||||
|
||||
func (te *customTestPdfExtractor) Match(filename string) bool {
|
||||
return strings.HasSuffix(filename, ".pdf")
|
||||
}
|
||||
@ -171,6 +178,10 @@ func (te *customTestPdfExtractor) Extract(filename string, r io.ReadSeeker) (str
|
||||
|
||||
type failingExtractor struct{}
|
||||
|
||||
func (te *failingExtractor) Name() string {
|
||||
return "failingExtractor"
|
||||
}
|
||||
|
||||
func (te *failingExtractor) Match(filename string) bool {
|
||||
return true
|
||||
}
|
||||
@ -180,11 +191,12 @@ func (te *failingExtractor) Extract(filename string, r io.ReadSeeker) (string, e
|
||||
}
|
||||
|
||||
func TestExtractWithExtraExtractors(t *testing.T) {
|
||||
logger := mlog.CreateConsoleTestLogger(t)
|
||||
t.Run("override existing extractor", func(t *testing.T) {
|
||||
data, err := testutils.ReadTestFile("sample-doc.pdf")
|
||||
require.NoError(t, err)
|
||||
|
||||
text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&customTestPdfExtractor{}})
|
||||
text, err := ExtractWithExtraExtractors(logger, "sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&customTestPdfExtractor{}})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, text, "this is a text generated content")
|
||||
})
|
||||
@ -193,7 +205,7 @@ func TestExtractWithExtraExtractors(t *testing.T) {
|
||||
data, err := testutils.ReadTestFile("sample-doc.pdf")
|
||||
require.NoError(t, err)
|
||||
|
||||
text, err := ExtractWithExtraExtractors("sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&failingExtractor{}})
|
||||
text, err := ExtractWithExtraExtractors(logger, "sample-doc.pdf", bytes.NewReader(data), ExtractSettings{}, []Extractor{&failingExtractor{}})
|
||||
require.NoError(t, err)
|
||||
assert.Contains(t, text, "simple")
|
||||
assert.Contains(t, text, "document")
|
||||
|
@ -26,6 +26,10 @@ var doconvConverterByExtensions = map[string]func(io.Reader) (string, map[string
|
||||
"pdf": docconv.ConvertPDF,
|
||||
}
|
||||
|
||||
func (de *documentExtractor) Name() string {
|
||||
return "documentExtractor"
|
||||
}
|
||||
|
||||
func (de *documentExtractor) Match(filename string) bool {
|
||||
extension := strings.TrimPrefix(path.Ext(filename), ".")
|
||||
_, ok := doconvConverterByExtensions[extension]
|
||||
|
@ -11,4 +11,5 @@ import (
|
||||
type Extractor interface {
|
||||
Match(filename string) bool
|
||||
Extract(filename string, file io.ReadSeeker) (string, error)
|
||||
Name() string
|
||||
}
|
||||
|
@ -36,6 +36,10 @@ func newMMPreviewExtractor(url string, secret string, pdfExtractor pdfExtractor)
|
||||
return &mmPreviewExtractor{url: url, secret: secret, pdfExtractor: pdfExtractor}
|
||||
}
|
||||
|
||||
func (mpe *mmPreviewExtractor) Name() string {
|
||||
return "mmPreviewExtractor"
|
||||
}
|
||||
|
||||
func (mpe *mmPreviewExtractor) Match(filename string) bool {
|
||||
extension := strings.TrimPrefix(path.Ext(filename), ".")
|
||||
return mmpreviewSupportedExtensions[extension]
|
||||
|
@ -17,6 +17,10 @@ import (
|
||||
|
||||
type pdfExtractor struct{}
|
||||
|
||||
func (pe *pdfExtractor) Name() string {
|
||||
return "pdfExtractor"
|
||||
}
|
||||
|
||||
func (pe *pdfExtractor) Match(filename string) bool {
|
||||
supportedExtensions := map[string]bool{
|
||||
"pdf": true,
|
||||
|
@ -11,6 +11,10 @@ import (
|
||||
|
||||
type plainExtractor struct{}
|
||||
|
||||
func (pe *plainExtractor) Name() string {
|
||||
return "plainExtractor"
|
||||
}
|
||||
|
||||
func (pe *plainExtractor) Match(filename string) bool {
|
||||
return true
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user