From 645ba23824841c6d8e18c93ce98ccdf0742dda33 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Mon, 9 Jan 2023 21:27:26 +0000 Subject: [PATCH 1/4] Add cron method to gc LFS MetaObjects This PR adds a task to the cron service to allow garbage collection of LFS meta objects. As repositories may have a large number of LFSMetaObjects, an updated column is added to this table and it is used to perform a generational GC to attempt to reduce the amount of work. (There may need to be a bit more work here but this is probably enough for the moment.) Fix #7045 Signed-off-by: Andrew Thornton --- custom/conf/app.example.ini | 17 +++++ .../doc/advanced/config-cheat-sheet.en-us.md | 8 ++ models/git/lfs.go | 24 +++++- models/migrations/migrations.go | 5 ++ models/migrations/v1_19/v238.go | 26 +++++++ modules/doctor/lfs.go | 16 +++- services/cron/tasks_extended.go | 39 ++++++++++ services/repository/lfs.go | 74 ++++++++++++++----- 8 files changed, 187 insertions(+), 22 deletions(-) create mode 100644 models/migrations/v1_19/v238.go diff --git a/custom/conf/app.example.ini b/custom/conf/app.example.ini index cec5e8cf03821..81ae6030b296f 100644 --- a/custom/conf/app.example.ini +++ b/custom/conf/app.example.ini @@ -2172,6 +2172,23 @@ ROUTER = console ;SCHEDULE = @every 168h ;OLDER_THAN = 8760h +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; Garbage collect LFS pointers in repositories +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;[cron.gc_lfs] +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;ENABLED = false +;; Garbage collect LFS pointers in repositories (default false) +;RUN_AT_START = false +;; Interval as a duration between each gc run (default every 24h) +;SCHEDULE = @every 24h +;; Only attempt to garbage collect LFSMetaObjects older than this (default 7 days) +;OLDER_THAN = 168h +;; Only attempt to garbage collect LFSMetaObjects that have not 
been attempted to be garbage collected for this long (default 3 days) +;LAST_UPDATED_MORE_THAN_AGO = 72h + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Git Operation timeout in seconds diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md index 3b2ff4cbbf1f0..f5011abc4894e 100644 --- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md +++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md @@ -1025,6 +1025,14 @@ Default templates for project boards: - `SCHEDULE`: **@every 168h**: Cron syntax to set how often to check. - `OLDER_THAN`: **@every 8760h**: any system notice older than this expression will be deleted from database. +#### Cron - Garbage collect LFS pointers in repositories ('cron.gc_lfs') + +- `ENABLED`: **false**: Enable service. +- `RUN_AT_START`: **false**: Run tasks at start up time (if ENABLED). +- `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check. +- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days) +- `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days) + ## Git (`git`) - `PATH`: **""**: The path of Git executable. If empty, Gitea searches through the PATH environment. 
diff --git a/models/git/lfs.go b/models/git/lfs.go index 34942646888c4..8ce48e2b7a132 100644 --- a/models/git/lfs.go +++ b/models/git/lfs.go @@ -115,6 +115,7 @@ type LFSMetaObject struct { RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"` Existing bool `xorm:"-"` CreatedUnix timeutil.TimeStamp `xorm:"created"` + UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` } func init() { @@ -335,7 +336,8 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) { } type IterateLFSMetaObjectsForRepoOptions struct { - OlderThan time.Time + OlderThan time.Time + UpdatedLessRecentlyThan time.Time } // IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo @@ -348,6 +350,8 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont LFSMetaObject } + id := int64(0) + for { beans := make([]*CountLFSMetaObject, 0, batchSize) // SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? 
GROUP BY lfs_meta_object.id @@ -357,7 +361,12 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont if !opts.OlderThan.IsZero() { sess.And("`lfs_meta_object`.created_unix < ?", opts.OlderThan) } + if !opts.UpdatedLessRecentlyThan.IsZero() { + sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan) + } + sess.And("`lfs_meta_object`.id > ?", id) sess.GroupBy("`lfs_meta_object`.id") + sess.OrderBy("`lfs_meta_object`.id ASC") if err := sess.Limit(batchSize, start).Find(&beans); err != nil { return err } @@ -371,5 +380,18 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont return err } } + id = beans[len(beans)-1].ID + } +} + +// MarkLFSMetaObject updates the updated time for the provided LFSMetaObject +func MarkLFSMetaObject(ctx context.Context, id int64) error { + obj := &LFSMetaObject{ + UpdatedUnix: timeutil.TimeStampNow(), + } + count, err := db.GetEngine(ctx).ID(id).Update(obj) + if count != 1 { + log.Error("Unexpectedly updated %d LFSMetaObjects with ID: %d", count, id) } + return err } diff --git a/models/migrations/migrations.go b/models/migrations/migrations.go index 9d9c8f5165e47..4e211617c0ff9 100644 --- a/models/migrations/migrations.go +++ b/models/migrations/migrations.go @@ -432,6 +432,9 @@ var migrations = []Migration{ NewMigration("Update counts of all open milestones", v1_18.UpdateOpenMilestoneCounts), // v230 -> v231 NewMigration("Add ConfidentialClient column (default true) to OAuth2Application table", v1_18.AddConfidentialClientColumnToOAuth2ApplicationTable), + + // Gitea 1.18.0 ends at v231 + // v231 -> v232 NewMigration("Add index for hook_task", v1_19.AddIndexForHookTask), // v232 -> v233 @@ -446,6 +449,8 @@ var migrations = []Migration{ NewMigration("Create secrets table", v1_19.CreateSecretsTable), // v237 -> v238 NewMigration("Drop ForeignReference table", v1_19.DropForeignReferenceTable), + // v238 -> v239 + NewMigration("Add updated unix to LFSMetaObject", 
v1_19.AddUpdatedUnixToLFSMetaObject), } // GetCurrentDBVersion returns the current db version diff --git a/models/migrations/v1_19/v238.go b/models/migrations/v1_19/v238.go new file mode 100644 index 0000000000000..74e2f1e7e7b8b --- /dev/null +++ b/models/migrations/v1_19/v238.go @@ -0,0 +1,26 @@ +// Copyright 2022 The Gitea Authors. All rights reserved. +// SPDX-License-Identifier: MIT + +package v1_19 //nolint + +import ( + "code.gitea.io/gitea/modules/timeutil" + "xorm.io/xorm" +) + +// AddUpdatedUnixToLFSMetaObject adds an updated column to the LFSMetaObject to allow for garbage collection +func AddUpdatedUnixToLFSMetaObject(x *xorm.Engine) error { + // Add an indexed `updated_unix` column to `lfs_meta_object`; the LFS garbage collector uses it to track + // when each meta object was last checked so that recently examined objects can be skipped. + // LFSMetaObject stores metadata for LFS tracked files. + type LFSMetaObject struct { + ID int64 `xorm:"pk autoincr"` + Oid string `json:"oid" xorm:"UNIQUE(s) INDEX NOT NULL"` + Size int64 `json:"size" xorm:"NOT NULL"` + RepositoryID int64 `xorm:"UNIQUE(s) INDEX NOT NULL"` + CreatedUnix timeutil.TimeStamp `xorm:"created"` + UpdatedUnix timeutil.TimeStamp `xorm:"INDEX updated"` + } + + return x.Sync(new(LFSMetaObject)) +} diff --git a/modules/doctor/lfs.go b/modules/doctor/lfs.go index 410ed5a9a5f89..63c1b6412ae02 100644 --- a/modules/doctor/lfs.go +++ b/modules/doctor/lfs.go @@ -6,6 +6,7 @@ package doctor import ( "context" "fmt" + "time" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" @@ -29,7 +30,20 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool return fmt.Errorf("LFS support is disabled") } - if err := repository.GarbageCollectLFSMetaObjects(ctx, logger, autofix); err != nil { + if err := repository.GarbageCollectLFSMetaObjects(ctx, repository.GarbageCollectLFSMetaObjectsOptions{ + Logger: logger, + AutoFix: autofix, + // Only attempt to garbage collect lfs meta 
objects older than a week as the order of git lfs upload + // and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby + // an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid + // changes in new branches that might lead to lfs objects becoming temporarily unassociated with git + // objects. + // + // It is likely that a week is potentially excessive but it should definitely be enough that any + // unassociated LFS object is genuinely unassociated. + OlderThan: 24 * time.Hour * 7, + // We don't set the UpdatedLessRecentlyThan because we want to do a full GC + }); err != nil { return err } diff --git a/services/cron/tasks_extended.go b/services/cron/tasks_extended.go index 4486be0c2fa7c..474cd771d33b7 100644 --- a/services/cron/tasks_extended.go +++ b/services/cron/tasks_extended.go @@ -175,6 +175,44 @@ func registerDeleteOldSystemNotices() { }) } +func registerGCLFS() { + if !setting.LFS.StartServer { + return + } + type GCLFSConfig struct { + OlderThanConfig + LastUpdatedMoreThanAgo time.Duration + } + + RegisterTaskFatal("delete_old_system_notices", &GCLFSConfig{ + OlderThanConfig: OlderThanConfig{ + BaseConfig: BaseConfig{ + Enabled: false, + RunAtStart: false, + Schedule: "@every 24h", + }, + // Only attempt to garbage collect lfs meta objects older than a week as the order of git lfs upload + // and git object upload is not necessarily guaranteed. It's possible to imagine a situation whereby + // an LFS object is uploaded but the git branch is not uploaded immediately, or there are some rapid + // changes in new branches that might lead to lfs objects becoming temporarily unassociated with git + // objects. + // + // It is likely that a week is potentially excessive but it should definitely be enough that any + // unassociated LFS object is genuinely unassociated. 
+ OlderThan: 24 * time.Hour * 7, + }, + // Only GC things that haven't been looked at in the past 3 days + LastUpdatedMoreThanAgo: 24 * time.Hour * 3, + }, func(ctx context.Context, _ *user_model.User, config Config) error { + gcLFSConfig := config.(*GCLFSConfig) + return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{ + AutoFix: true, + OlderThan: gcLFSConfig.OlderThan, + LastUpdatedMoreThanAgo: gcLFSConfig.LastUpdatedMoreThanAgo, + }) + }) +} + func initExtendedTasks() { registerDeleteInactiveUsers() registerDeleteRepositoryArchives() @@ -188,4 +226,5 @@ func initExtendedTasks() { registerDeleteOldActions() registerUpdateGiteaChecker() registerDeleteOldSystemNotices() + registerGCLFS() } diff --git a/services/repository/lfs.go b/services/repository/lfs.go index 7806e20a9f750..53997d852e473 100644 --- a/services/repository/lfs.go +++ b/services/repository/lfs.go @@ -14,40 +14,63 @@ import ( "code.gitea.io/gitea/modules/git" "code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/log" + "code.gitea.io/gitea/modules/setting" "xorm.io/builder" ) -func GarbageCollectLFSMetaObjects(ctx context.Context, logger log.Logger, autofix bool) error { +// GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function +type GarbageCollectLFSMetaObjectsOptions struct { + RepoID int64 + Logger log.Logger + AutoFix bool + OlderThan time.Duration + LastUpdatedMoreThanAgo time.Duration +} + +// GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories +func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMetaObjectsOptions) error { log.Trace("Doing: GarbageCollectLFSMetaObjects") + defer log.Trace("Finished: GarbageCollectLFSMetaObjects") + + if !setting.LFS.StartServer { + if opts.Logger != nil { + opts.Logger.Info("LFS support is disabled") + } + return nil + } + + if opts.RepoID == 0 { + repo, err := repo_model.GetRepositoryByID(ctx, opts.RepoID) + 
if err != nil { + return err + } + return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts) + } - if err := db.Iterate( + return db.Iterate( ctx, builder.And(builder.Gt{"id": 0}), func(ctx context.Context, repo *repo_model.Repository) error { - return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, logger, autofix) + return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts) }, - ); err != nil { - return err - } - - log.Trace("Finished: GarbageCollectLFSMetaObjects") - return nil + ) } -func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, logger log.Logger, autofix bool) error { - if logger != nil { - logger.Info("Checking %-v", repo) +// GarbageCollectLFSMetaObjectsForRepo garbage collects LFS objects for a specific repository +func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.Repository, opts GarbageCollectLFSMetaObjectsOptions) error { + if opts.Logger != nil { + opts.Logger.Info("Checking %-v", repo) } total, orphaned, collected, deleted := 0, 0, 0, 0 - if logger != nil { + if opts.Logger != nil { defer func() { if orphaned == 0 { - logger.Info("Found %d total LFSMetaObjects in %-v", total, repo) - } else if !autofix { - logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo) + opts.Logger.Info("Found %d total LFSMetaObjects in %-v", total, repo) + } else if !opts.AutoFix { + opts.Logger.Info("Found %d/%d orphaned LFSMetaObjects in %-v", orphaned, total, repo) } else { - logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. %d removed from storage.", collected, orphaned, total, repo, deleted) + opts.Logger.Info("Collected %d/%d orphaned/%d total LFSMetaObjects in %-v. 
%d removed from storage.", collected, orphaned, total, repo, deleted) } }() } @@ -61,16 +84,26 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R store := lfs.NewContentStore() + var olderThan time.Time + var updatedLessRecentlyThan time.Time + + if opts.OlderThan > 0 { + olderThan = time.Now().Add(opts.OlderThan) + } + if opts.LastUpdatedMoreThanAgo > 0 { + updatedLessRecentlyThan = time.Now().Add(opts.LastUpdatedMoreThanAgo) + } + return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error { total++ pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent())) if gitRepo.IsObjectExist(pointerSha.String()) { - return nil + return git_model.MarkLFSMetaObject(ctx, metaObject.ID) } orphaned++ - if !autofix { + if !opts.AutoFix { return nil } // Non-existent pointer file @@ -100,6 +133,7 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R // // It is likely that a week is potentially excessive but it should definitely be enough that any // unassociated LFS object is genuinely unassociated. 
- OlderThan: time.Now().Add(-24 * 7 * time.Hour), + OlderThan: olderThan, + UpdatedLessRecentlyThan: updatedLessRecentlyThan, }) } From 35c7906dd51f0877a2f2db1b553d11c4d6d74b36 Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 10 Jan 2023 02:44:21 +0000 Subject: [PATCH 2/4] Improve generational GC and only check repos with LFSMetaObjects Signed-off-by: Andrew Thornton --- custom/conf/app.example.ini | 5 ++ .../doc/advanced/config-cheat-sheet.en-us.md | 2 + models/git/lfs.go | 53 +++++++++++++-- modules/doctor/lfs.go | 2 +- services/cron/tasks_extended.go | 16 +++-- services/repository/lfs.go | 67 ++++++++++--------- 6 files changed, 99 insertions(+), 46 deletions(-) diff --git a/custom/conf/app.example.ini b/custom/conf/app.example.ini index 81ae6030b296f..105aa2a33ffa6 100644 --- a/custom/conf/app.example.ini +++ b/custom/conf/app.example.ini @@ -2188,6 +2188,11 @@ ROUTER = console ;OLDER_THAN = 168h ;; Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days) ;LAST_UPDATED_MORE_THAN_AGO = 72h +;; Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all. +;NUMBER_TO_CHECK_PER_REPO = 100 +;; Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.) +;PROPORTION_TO_CHECK_PER_REPO = 0.6 + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; diff --git a/docs/content/doc/advanced/config-cheat-sheet.en-us.md b/docs/content/doc/advanced/config-cheat-sheet.en-us.md index f5011abc4894e..90a695e3ff2f7 100644 --- a/docs/content/doc/advanced/config-cheat-sheet.en-us.md +++ b/docs/content/doc/advanced/config-cheat-sheet.en-us.md @@ -1032,6 +1032,8 @@ Default templates for project boards: - `SCHEDULE`: **@every 24h**: Cron syntax to set how often to check. 
- `OLDER_THAN`: **168h**: Only attempt to garbage collect LFSMetaObjects older than this (default 7 days) - `LAST_UPDATED_MORE_THAN_AGO`: **72h**: Only attempt to garbage collect LFSMetaObjects that have not been attempted to be garbage collected for this long (default 3 days) +- `NUMBER_TO_CHECK_PER_REPO`: **100**: Minimum number of stale LFSMetaObjects to check per repo. Set to `0` to always check all. +- `PROPORTION_TO_CHECK_PER_REPO`: **0.6**: Check at least this proportion of LFSMetaObjects per repo. (This may cause all stale LFSMetaObjects to be checked.) ## Git (`git`) diff --git a/models/git/lfs.go b/models/git/lfs.go index 8ce48e2b7a132..0ba8e919d0ecb 100644 --- a/models/git/lfs.go +++ b/models/git/lfs.go @@ -335,9 +335,45 @@ func GetRepoLFSSize(ctx context.Context, repoID int64) (int64, error) { return lfsSize, nil } +// IterateRepositoryIDsWithLFSMetaObjects iterates across the repositories that have LFSMetaObjects +func IterateRepositoryIDsWithLFSMetaObjects(ctx context.Context, f func(ctx context.Context, repoID, count int64) error) error { + batchSize := setting.Database.IterateBufferSize + sess := db.GetEngine(ctx) + id := int64(0) + type RepositoryCount struct { + RepositoryID int64 + Count int64 + } + for { + counts := make([]*RepositoryCount, 0, batchSize) + sess.Select("repository_id, COUNT(id) AS count"). + Table("lfs_meta_object"). + Where("repository_id > ?", id). + GroupBy("repository_id"). 
+ OrderBy("repository_id ASC") + + if err := sess.Limit(batchSize, 0).Find(&counts); err != nil { + return err + } + if len(counts) == 0 { + return nil + } + + for _, count := range counts { + if err := f(ctx, count.RepositoryID, count.Count); err != nil { + return err + } + } + id = counts[len(counts)-1].RepositoryID + } +} + +// IterateLFSMetaObjectsForRepoOptions provides options for IterateLFSMetaObjectsForRepo type IterateLFSMetaObjectsForRepoOptions struct { - OlderThan time.Time - UpdatedLessRecentlyThan time.Time + OlderThan time.Time + UpdatedLessRecentlyThan time.Time + OrderByUpdated bool + LoopFunctionAlwaysUpdates bool } // IterateLFSMetaObjectsForRepo provides a iterator for LFSMetaObjects per Repo @@ -354,7 +390,6 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont for { beans := make([]*CountLFSMetaObject, 0, batchSize) - // SELECT `lfs_meta_object`.*, COUNT(`l1`.id) as `count` FROM lfs_meta_object INNER JOIN lfs_meta_object AS l1 ON l1.oid = lfs_meta_object.oid WHERE lfs_meta_object.repository_id = ? GROUP BY lfs_meta_object.id sess := engine.Select("`lfs_meta_object`.*, COUNT(`l1`.oid) AS `count`"). Join("INNER", "`lfs_meta_object` AS l1", "`lfs_meta_object`.oid = `l1`.oid"). 
Where("`lfs_meta_object`.repository_id = ?", repoID) @@ -364,16 +399,22 @@ func IterateLFSMetaObjectsForRepo(ctx context.Context, repoID int64, f func(cont if !opts.UpdatedLessRecentlyThan.IsZero() { sess.And("`lfs_meta_object`.updated_unix < ?", opts.UpdatedLessRecentlyThan) } - sess.And("`lfs_meta_object`.id > ?", id) sess.GroupBy("`lfs_meta_object`.id") - sess.OrderBy("`lfs_meta_object`.id ASC") + if opts.OrderByUpdated { + sess.OrderBy("`lfs_meta_object`.updated_unix ASC") + } else { + sess.And("`lfs_meta_object`.id > ?", id) + sess.OrderBy("`lfs_meta_object`.id ASC") + } if err := sess.Limit(batchSize, start).Find(&beans); err != nil { return err } if len(beans) == 0 { return nil } - start += len(beans) + if !opts.LoopFunctionAlwaysUpdates { + start += len(beans) + } for _, bean := range beans { if err := f(ctx, &bean.LFSMetaObject, bean.Count); err != nil { diff --git a/modules/doctor/lfs.go b/modules/doctor/lfs.go index 63c1b6412ae02..64ee4c40bfebf 100644 --- a/modules/doctor/lfs.go +++ b/modules/doctor/lfs.go @@ -41,7 +41,7 @@ func garbageCollectLFSCheck(ctx context.Context, logger log.Logger, autofix bool // // It is likely that a week is potentially excessive but it should definitely be enough that any // unassociated LFS object is genuinely unassociated. 
- OlderThan: 24 * time.Hour * 7, + OlderThan: time.Now().Add(-24 * time.Hour * 7), // We don't set the UpdatedLessRecentlyThan because we want to do a full GC }); err != nil { return err diff --git a/services/cron/tasks_extended.go b/services/cron/tasks_extended.go index 474cd771d33b7..520d940edf3c5 100644 --- a/services/cron/tasks_extended.go +++ b/services/cron/tasks_extended.go @@ -181,10 +181,12 @@ func registerGCLFS() { } type GCLFSConfig struct { OlderThanConfig - LastUpdatedMoreThanAgo time.Duration + LastUpdatedMoreThanAgo time.Duration + NumberToCheckPerRepo int64 + ProportionToCheckPerRepo float64 } - RegisterTaskFatal("delete_old_system_notices", &GCLFSConfig{ + RegisterTaskFatal("gc_lfs", &GCLFSConfig{ OlderThanConfig: OlderThanConfig{ BaseConfig: BaseConfig{ Enabled: false, @@ -202,13 +204,15 @@ func registerGCLFS() { OlderThan: 24 * time.Hour * 7, }, // Only GC things that haven't been looked at in the past 3 days - LastUpdatedMoreThanAgo: 24 * time.Hour * 3, + LastUpdatedMoreThanAgo: 24 * time.Hour * 3, + NumberToCheckPerRepo: 100, + ProportionToCheckPerRepo: 0.6, }, func(ctx context.Context, _ *user_model.User, config Config) error { gcLFSConfig := config.(*GCLFSConfig) return repo_service.GarbageCollectLFSMetaObjects(ctx, repo_service.GarbageCollectLFSMetaObjectsOptions{ - AutoFix: true, - OlderThan: gcLFSConfig.OlderThan, - LastUpdatedMoreThanAgo: gcLFSConfig.LastUpdatedMoreThanAgo, + AutoFix: true, + OlderThan: time.Now().Add(-gcLFSConfig.OlderThan), + UpdatedLessRecentlyThan: time.Now().Add(-gcLFSConfig.LastUpdatedMoreThanAgo), }) }) } diff --git a/services/repository/lfs.go b/services/repository/lfs.go index 53997d852e473..aeb808a72f330 100644 --- a/services/repository/lfs.go +++ b/services/repository/lfs.go @@ -5,27 +5,26 @@ package repository import ( "context" + "errors" "fmt" "time" - "code.gitea.io/gitea/models/db" git_model "code.gitea.io/gitea/models/git" repo_model "code.gitea.io/gitea/models/repo" "code.gitea.io/gitea/modules/git" 
"code.gitea.io/gitea/modules/lfs" "code.gitea.io/gitea/modules/log" "code.gitea.io/gitea/modules/setting" - - "xorm.io/builder" ) // GarbageCollectLFSMetaObjectsOptions provides options for GarbageCollectLFSMetaObjects function type GarbageCollectLFSMetaObjectsOptions struct { - RepoID int64 - Logger log.Logger - AutoFix bool - OlderThan time.Duration - LastUpdatedMoreThanAgo time.Duration + Logger log.Logger + AutoFix bool + OlderThan time.Time + UpdatedLessRecentlyThan time.Time + NumberToCheckPerRepo int64 + ProportionToCheckPerRepo float64 } // GarbageCollectLFSMetaObjects garbage collects LFS objects for all repositories @@ -40,21 +39,17 @@ func GarbageCollectLFSMetaObjects(ctx context.Context, opts GarbageCollectLFSMet return nil } - if opts.RepoID == 0 { - repo, err := repo_model.GetRepositoryByID(ctx, opts.RepoID) + return git_model.IterateRepositoryIDsWithLFSMetaObjects(ctx, func(ctx context.Context, repoID, count int64) error { + repo, err := repo_model.GetRepositoryByID(ctx, repoID) if err != nil { return err } - return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts) - } - return db.Iterate( - ctx, - builder.And(builder.Gt{"id": 0}), - func(ctx context.Context, repo *repo_model.Repository) error { - return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts) - }, - ) + if newMinimum := int64(float64(count) * opts.ProportionToCheckPerRepo); newMinimum > opts.NumberToCheckPerRepo && opts.NumberToCheckPerRepo != 0 { + opts.NumberToCheckPerRepo = newMinimum + } + return GarbageCollectLFSMetaObjectsForRepo(ctx, repo, opts) + }) } // GarbageCollectLFSMetaObjectsForRepo garbage collects LFS objects for a specific repository @@ -62,7 +57,7 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R if opts.Logger != nil { opts.Logger.Info("Checking %-v", repo) } - total, orphaned, collected, deleted := 0, 0, 0, 0 + total, orphaned, collected, deleted := int64(0), 0, 0, 0 if opts.Logger != nil { defer func() { if orphaned == 0 { @@ 
-83,18 +78,12 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R defer gitRepo.Close() store := lfs.NewContentStore() + errStop := errors.New("STOPERR") - var olderThan time.Time - var updatedLessRecentlyThan time.Time - - if opts.OlderThan > 0 { - olderThan = time.Now().Add(opts.OlderThan) - } - if opts.LastUpdatedMoreThanAgo > 0 { - updatedLessRecentlyThan = time.Now().Add(opts.LastUpdatedMoreThanAgo) - } - - return git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error { + err = git_model.IterateLFSMetaObjectsForRepo(ctx, repo.ID, func(ctx context.Context, metaObject *git_model.LFSMetaObject, count int64) error { + if opts.NumberToCheckPerRepo > 0 && total > opts.NumberToCheckPerRepo { + return errStop + } total++ pointerSha := git.ComputeBlobHash([]byte(metaObject.Pointer.StringContent())) @@ -133,7 +122,19 @@ func GarbageCollectLFSMetaObjectsForRepo(ctx context.Context, repo *repo_model.R // // It is likely that a week is potentially excessive but it should definitely be enough that any // unassociated LFS object is genuinely unassociated. - OlderThan: olderThan, - UpdatedLessRecentlyThan: updatedLessRecentlyThan, + OlderThan: opts.OlderThan, + UpdatedLessRecentlyThan: opts.UpdatedLessRecentlyThan, + OrderByUpdated: true, + LoopFunctionAlwaysUpdates: true, }) + + if err == errStop { + if opts.Logger != nil { + opts.Logger.Info("Processing stopped at %d total LFSMetaObjects in %-v", total, repo) + } + return nil + } else if err != nil { + return err + } + return nil } From 6fc1a0e81a971f5d7de4c4c3c76d59ccbafd3eea Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 10 Jan 2023 02:45:10 +0000 Subject: [PATCH 3/4] fmt! 
Signed-off-by: Andrew Thornton --- models/migrations/v1_19/v238.go | 1 + 1 file changed, 1 insertion(+) diff --git a/models/migrations/v1_19/v238.go b/models/migrations/v1_19/v238.go index 74e2f1e7e7b8b..266e6cea58a8a 100644 --- a/models/migrations/v1_19/v238.go +++ b/models/migrations/v1_19/v238.go @@ -5,6 +5,7 @@ package v1_19 //nolint import ( "code.gitea.io/gitea/modules/timeutil" + "xorm.io/xorm" ) From ea5306079bdc43c9b10d889a7b5401ed143fe11f Mon Sep 17 00:00:00 2001 From: Andrew Thornton Date: Tue, 10 Jan 2023 09:10:07 +0000 Subject: [PATCH 4/4] add locale entry Signed-off-by: Andrew Thornton --- options/locale/locale_en-US.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/options/locale/locale_en-US.ini b/options/locale/locale_en-US.ini index ddc0ee25d51e4..a5260f3c3bdba 100644 --- a/options/locale/locale_en-US.ini +++ b/options/locale/locale_en-US.ini @@ -2548,6 +2548,7 @@ dashboard.delete_old_actions = Delete all old actions from database dashboard.delete_old_actions.started = Delete all old actions from database started. dashboard.update_checker = Update checker dashboard.delete_old_system_notices = Delete all old system notices from database +dashboard.gc_lfs = Garbage collect LFS meta objects users.user_manage_panel = User Account Management users.new_account = Create User Account