backblaze-backup/internal/services/backblaze.go

459 lines
10 KiB
Go
Raw Normal View History

2023-07-10 11:59:37 +02:00
package services
import (
"context"
"errors"
"fmt"
"io"
"io/fs"
"log"
"os"
"path/filepath"
"runtime"
2023-07-18 19:01:16 +02:00
"strconv"
"strings"
2023-07-10 11:59:37 +02:00
"sync"
2023-08-01 13:41:20 +02:00
"time"
2023-07-10 11:59:37 +02:00
2023-07-18 19:01:16 +02:00
rcloneb2 "github.com/rclone/rclone/backend/b2"
2023-07-10 11:59:37 +02:00
"github.com/kurin/blazer/b2"
2023-07-18 19:01:16 +02:00
"github.com/rclone/rclone/fs/config/configmap"
"github.com/rclone/rclone/fs/operations"
2023-07-10 11:59:37 +02:00
"golang.org/x/sync/semaphore"
)
2023-08-01 13:11:19 +02:00
// Upload tuning values used by the sync routines in this file.
const (
	// writers is assigned to ConcurrentUploads on every B2 object writer.
	writers = 20
	// maxConcurrentWeight is the total weight of the upload semaphore
	// created by OldSync.
	maxConcurrentWeight = 10
	// largeFileSize is the size in bytes above which OldSync charges a file
	// a semaphore weight of 2 instead of 1.
	largeFileSize = 500 * 1024 * 1024 // 500 MiB
)
2023-08-01 13:41:20 +02:00
// UploadMessage carries an object key together with a start and an end
// timestamp. Sync receives one per upload and logs its three fields.
type UploadMessage struct {
	key            string
	startAt, endAt time.Time
}
2023-07-10 11:59:37 +02:00
// BackBalze holds the B2 credentials and the upload target/source settings
// used by the methods in this file.
//
// Exactly one of dir or filePath is expected to be set before calling Sync
// or OldSync; bucketName names the destination bucket.
type BackBalze struct {
	bucketName, dir, filePath string
	// maxWorkers bounds the number of concurrent worker goroutines.
	maxWorkers int
	// bbID and bbKey are the credentials passed to b2.NewClient.
	bbID, bbKey string
}
func NewBackBlaze(bbID, bbKey string) *BackBalze {
2023-07-13 22:01:18 +02:00
log.Println("runtime.NumCPU()", runtime.NumCPU())
2023-07-10 11:59:37 +02:00
return &BackBalze{
bbID: bbID,
bbKey: bbKey,
2023-07-27 09:46:14 +02:00
maxWorkers: runtime.NumCPU() * 3,
2023-07-10 11:59:37 +02:00
}
}
// WithBucket sets the destination bucket name and returns the receiver so
// calls can be chained.
func (b *BackBalze) WithBucket(bucketName string) *BackBalze {
	b.bucketName = bucketName
	return b
}
// WithDir sets the local directory to upload and returns the receiver so
// calls can be chained.
func (b *BackBalze) WithDir(dir string) *BackBalze {
	b.dir = dir
	return b
}
// WithFile sets the single local file to upload and returns the receiver so
// calls can be chained.
func (b *BackBalze) WithFile(filePath string) *BackBalze {
	b.filePath = filePath
	return b
}
func (b *BackBalze) Sync(ctx context.Context) error {
2023-08-01 13:41:20 +02:00
msgsChan := make(chan UploadMessage)
2023-07-10 11:59:37 +02:00
if b.bucketName == "" && (b.filePath == "" || b.dir == "") {
return fmt.Errorf("bucket name is %v | filePath is %v | dir is %v", b.bucketName, b.filePath, b.dir)
}
if b.filePath != "" && b.dir != "" {
return errors.New("you must select just 1 option, dir or file")
}
b2Client, err := b2.NewClient(ctx, b.bbID, b.bbKey)
if err != nil {
return fmt.Errorf("b2.NewClient %w", err)
}
log.Println("b2Client ok")
bc, err := b2Client.Bucket(ctx, b.bucketName)
if err != nil {
return fmt.Errorf("b2Client.Bucket %w", err)
}
if bc == nil {
return fmt.Errorf("bucket doesn't exist %s", b.bucketName)
}
log.Println("bucket found:", bc.Name())
if b.filePath != "" {
log.Println("file:", b.filePath)
2023-08-01 13:41:20 +02:00
if _, err := copyFile(ctx, bc, b.filePath); err != nil {
2023-07-10 11:59:37 +02:00
return fmt.Errorf("copyFile %w", err)
}
return nil
}
if b.dir != "" {
oldFiles, err := bucketFiles(ctx, bc)
2023-07-13 22:01:18 +02:00
if err != nil {
return fmt.Errorf("bucketFiles %w", err)
}
log.Println(strings.Repeat("*", 40))
log.Println("oldFiles to clean:\n\t\t" + strings.Join(oldFiles, "\n\t\t"))
log.Println(strings.Repeat("*", 40))
fileChan := make(chan string)
var wg sync.WaitGroup
2023-08-01 13:41:20 +02:00
go func() {
for m := range msgsChan {
log.Printf("\n\t%s:\n\tstart %s \n\tend %s\n", m.key, m.startAt.Format(time.RFC3339Nano), m.endAt.Format(time.RFC3339Nano))
}
}()
2023-07-13 22:01:18 +02:00
for i := 0; i < b.maxWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for src := range fileChan {
2023-08-01 13:41:20 +02:00
msg, err := copyFile(ctx, bc, src)
if err != nil {
2023-07-13 22:01:18 +02:00
log.Printf("error copying file %s: %v\n", src, err)
2023-08-06 16:40:01 +02:00
continue
2023-07-13 22:01:18 +02:00
}
2023-08-01 13:41:20 +02:00
msgsChan <- msg
2023-07-13 22:01:18 +02:00
}
}()
}
// Walk the directory and send files to the channel for uploading
err = filepath.WalkDir(b.dir, func(path string, d fs.DirEntry, err error) error {
if err != nil {
return err
}
if !d.IsDir() {
fileChan <- path
}
return nil
})
if err != nil {
return fmt.Errorf("error walking the directory: %v", err)
}
// Close the channel (no more files to send)
close(fileChan)
wg.Wait()
2023-08-01 13:41:20 +02:00
close(msgsChan)
2023-07-13 22:01:18 +02:00
// Cleanup old files after backup is completed
if err := cleanBucket(ctx, bc, oldFiles); err != nil {
return fmt.Errorf("cleanBucket %w", err)
}
}
log.Println("copied successfully")
return nil
}
func (b *BackBalze) OldSync() error {
if b.bucketName == "" && (b.filePath == "" || b.dir == "") {
return fmt.Errorf("bucket name is %v | filePath is %v | dir is %v", b.bucketName, b.filePath, b.dir)
}
if b.filePath != "" && b.dir != "" {
return errors.New("you must select just 1 option, dir or file")
}
ctx := context.Background()
b2Client, err := b2.NewClient(ctx, b.bbID, b.bbKey)
if err != nil {
return fmt.Errorf("b2.NewClient %w", err)
}
log.Println("b2Client ok")
bc, err := b2Client.Bucket(ctx, b.bucketName)
if err != nil {
return fmt.Errorf("b2Client.Bucket %w", err)
}
if bc == nil {
return fmt.Errorf("bucket doesn't exist %s", b.bucketName)
}
log.Println("bucket found:", bc.Name())
if b.filePath != "" {
log.Println("file:", b.filePath)
2023-08-01 13:41:20 +02:00
if _, err := copyFile(ctx, bc, b.filePath); err != nil {
2023-07-13 22:01:18 +02:00
return fmt.Errorf("copyFile %w", err)
}
return nil
}
if b.dir != "" {
oldFiles, err := bucketFiles(ctx, bc)
if err != nil {
return fmt.Errorf("bucketFiles %w", err)
2023-07-10 11:59:37 +02:00
}
log.Println(strings.Repeat("*", 40))
log.Println("oldFiles to clean:\n\t\t" + strings.Join(oldFiles, "\n\t\t"))
log.Println(strings.Repeat("*", 40))
2023-07-10 11:59:37 +02:00
fileChan := make(chan string)
uploadSem := semaphore.NewWeighted(maxConcurrentWeight)
var wg sync.WaitGroup
for i := 0; i < b.maxWorkers; i++ {
wg.Add(1)
go func() {
defer wg.Done()
for src := range fileChan {
info, err := os.Stat(src)
if err != nil {
log.Printf("error getting file info %s: %v\n", src, err)
continue
}
weight := int64(1)
if info.Size() > largeFileSize {
weight = 2
}
if err := uploadSem.Acquire(ctx, weight); err == nil {
log.Println("start copying file", src)
2023-08-01 13:41:20 +02:00
if _, err := copyFile(ctx, bc, src); err != nil {
2023-07-10 11:59:37 +02:00
log.Printf("error copying file %s: %v\n", src, err)
}
uploadSem.Release(weight)
} else {
log.Printf("error acquiring semaphore: %v\n", err)
}
}
}()
}
// Walk the directory and send files to the channel for uploading
err = filepath.WalkDir(b.dir, func(path string, d fs.DirEntry, err error) error {
2023-07-10 11:59:37 +02:00
if err != nil {
return err
}
if !d.IsDir() {
fileChan <- path
}
return nil
})
if err != nil {
return fmt.Errorf("error walking the directory: %v", err)
}
// Close the channel (no more files to send)
close(fileChan)
wg.Wait()
// Cleanup old files after backup is completed
if err := cleanBucket(ctx, bc, oldFiles); err != nil {
return fmt.Errorf("cleanBucket %w", err)
}
2023-07-10 11:59:37 +02:00
}
log.Println("copied successfully")
return nil
}
2023-08-06 16:40:01 +02:00
func copyFile(ctx context.Context, bucket *b2.Bucket, src string) error {
2023-07-10 11:59:37 +02:00
f, err := os.Open(src)
if err != nil {
2023-08-06 16:40:01 +02:00
return err
2023-07-10 11:59:37 +02:00
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
2023-08-06 16:40:01 +02:00
return err
2023-07-10 11:59:37 +02:00
}
w := bucket.Object(fi.Name()).NewWriter(ctx)
w.ConcurrentUploads = writers
2023-08-06 16:40:01 +02:00
// w.ChunkSize = 1e9 / 2
w.UseFileBuffer = true
log.Println("start copying", fi.Name())
2023-07-10 11:59:37 +02:00
if _, err := io.Copy(w, f); err != nil {
w.Close()
2023-08-06 16:40:01 +02:00
return err
2023-07-10 11:59:37 +02:00
}
2023-08-06 16:40:01 +02:00
log.Println("end copying", fi.Name())
return w.Close()
2023-07-10 11:59:37 +02:00
}
// cleanBucket calls Delete on the object for every name in files.
//
// A name whose object handle is nil is logged and skipped. Deletion failures
// do not stop the loop: their messages are collected and returned as a
// single error, each message followed by "; ". nil is returned when every
// deletion succeeded (or files is empty).
func cleanBucket(ctx context.Context, bucket *b2.Bucket, files []string) error {
	var failures []string
	for _, name := range files {
		target := bucket.Object(name)
		if target == nil {
			log.Println("object is nil", name)
			continue
		}
		if delErr := target.Delete(ctx); delErr != nil {
			failures = append(failures, fmt.Sprintf("error deleting %s : %v", name, delErr))
		}
	}

	if len(failures) == 0 {
		return nil
	}
	// Every entry, including the last, is terminated by "; ".
	return errors.New(strings.Join(failures, "; ") + "; ")
}
func bucketFiles(ctx context.Context, bucket *b2.Bucket) ([]string, error) {
2023-07-10 11:59:37 +02:00
bucketIter := bucket.List(ctx)
if bucketIter == nil {
return nil, fmt.Errorf("bucket list cannot be nil")
2023-07-10 11:59:37 +02:00
}
var files []string
2023-07-10 11:59:37 +02:00
for {
if !bucketIter.Next() {
if bucketIter.Err() != nil {
return nil, fmt.Errorf("bucketIter err %w", bucketIter.Err())
2023-07-10 11:59:37 +02:00
}
break
}
if bucketIter.Object() == nil {
log.Println("bucketIter Object is nil")
continue
}
files = append(files, bucketIter.Object().Name())
2023-07-10 11:59:37 +02:00
}
return files, nil
2023-07-10 11:59:37 +02:00
}
// duplicate records an object name that was listed more than once in a
// bucket, together with how many times it appeared.
type duplicate struct {
	bucket, file string
	count        int
}
2023-07-18 15:00:12 +02:00
// dir returns the bucket-qualified directory that contains the duplicated
// file, in the form "bucket/path/to/dir".
//
// For a file at the top level of the bucket (no "/" in its name, or only a
// leading one) the bucket name alone is returned. The result is used by
// CleanUp as the root passed to the rclone B2 backend, where the first path
// segment is read as the bucket name, so the bucket must always come first.
func (d duplicate) dir() string {
	idx := strings.LastIndex(d.file, "/")
	if idx <= 0 {
		return d.bucket
	}
	return d.bucket + "/" + d.file[:idx]
}
func (b *BackBalze) CleanUp(ctx context.Context, cancel context.CancelFunc) error {
b2Client, err := b2.NewClient(ctx, b.bbID, b.bbKey)
if err != nil {
return fmt.Errorf("b2.NewClient %w", err)
}
log.Println("b2Client ok")
dups, err := b.listDuplicates(ctx, cancel, b2Client)
if err != nil {
return fmt.Errorf("b.listDuplicates: %w", err)
}
if len(dups) <= 0 {
return nil
}
for _, d := range dups {
2023-07-18 19:01:16 +02:00
smpl := configmap.Simple{}
smpl.Set("account", b.bbID)
smpl.Set("key", b.bbKey)
smpl.Set("chunk_size", strconv.FormatInt(int64(9600), 10))
f, err := rcloneb2.NewFs(ctx, "B2", d.dir(), smpl)
if err != nil {
return fmt.Errorf("rclonefs.NewFs %w", err)
}
if err := operations.CleanUp(ctx, f); err != nil {
return fmt.Errorf("operations.CleanUp %w", err)
2023-07-18 15:00:12 +02:00
}
}
return nil
}
func (b *BackBalze) ListDuplicateVersions(ctx context.Context, cancel context.CancelFunc) error {
b2Client, err := b2.NewClient(ctx, b.bbID, b.bbKey)
if err != nil {
return fmt.Errorf("b2.NewClient %w", err)
}
log.Println("b2Client ok")
2023-07-18 15:00:12 +02:00
dups, err := b.listDuplicates(ctx, cancel, b2Client)
if err != nil {
return fmt.Errorf("b.listDuplicates: %w", err)
}
if len(dups) > 0 {
var builder strings.Builder
for _, dup := range dups {
builder.WriteString(fmt.Sprintf("%+v\n", dup))
}
return fmt.Errorf("found duplicates: %s", builder.String())
}
return nil
}
func (b *BackBalze) listDuplicates(ctx context.Context, cancel context.CancelFunc, b2Client *b2.Client) ([]duplicate, error) {
buckets, err := b2Client.ListBuckets(ctx)
if err != nil {
2023-07-18 15:00:12 +02:00
return nil, fmt.Errorf("b2Client.Bucket %w", err)
}
wg := sync.WaitGroup{}
dups := make([]duplicate, 0)
log.Println("len(buckets)", len(buckets))
sm := semaphore.NewWeighted(int64(b.maxWorkers))
wg.Add(len(buckets))
for _, bc := range buckets {
if err := sm.Acquire(ctx, 1); err != nil {
2023-07-18 15:00:12 +02:00
return nil, fmt.Errorf("sm.Acquire %w", err)
}
go func(bc *b2.Bucket) {
defer sm.Release(1)
defer wg.Done()
files := make(map[string]int, 0)
bucketIter := bc.List(ctx, b2.ListHidden())
if bucketIter == nil {
log.Println("bucket list cannot be nil")
return
}
for {
if !bucketIter.Next() {
if bucketIter.Err() != nil {
log.Println("bucketIter err %w", bucketIter.Err())
return
}
break
}
if bucketIter.Object() == nil {
log.Println("bucketIter Object is nil")
continue
}
files[bucketIter.Object().Name()]++
}
// Search duplicates
for file, count := range files {
if count > 1 {
dups = append(dups, duplicate{
bucket: bc.Name(),
file: file,
count: count,
})
}
}
}(bc)
}
wg.Wait()
2023-07-18 15:00:12 +02:00
return dups, nil
}