Moved to single file storage (appending chunks to existing file)
continuous-integration/drone/push Build is failing Details

Added file finalize actions
Added error handling for files
Moved everything to pathlib
Simplified models
Squashed all migrations to single operation
This commit is contained in:
Jordi Loyzaga 2024-09-17 01:52:09 -06:00
parent a58f593c07
commit 526e0e7ddc
9 changed files with 149 additions and 241 deletions

1
.gitignore vendored
View File

@ -95,3 +95,4 @@ venv.bak/
lockbox/media
lockbox/staticfiles
TODO.txt
FILES

View File

@ -7,18 +7,15 @@ class UPLOAD_STATUS_TYPES:
UPLOADING = "uploading"
COMPLETED = "completed"
ABANDONED = "abandoned"
PROCESSING = "processing"
ERROR = "error"
# Machine-readable codes carried by upload failures (passed as the ``code``
# keyword when an UploadError is raised).
class UPLOAD_ERROR_CODES:
# A follow-up chunk (start_bytes != 0) arrived but the file is not on disk.
FILE_MISSING = "file_missing"
# The chunk's start offset does not continue from the last recorded end offset.
CHUNK_MISMATCH = "chunk_mismatch"
# Config
CONFIG_KEYS = {
"EXPIRATION_DELTA_MINUTES": {
"description": "Date created + this delta at which file expires",
"verbose_name": "File expiration delta (minutes)",
"native_type": int,
"sensitive": False,
"default": 120,
},
"ABANDONED_DELTA_MINUTES": {
"description": "Date created + this delta at which a file is marked as abandoned",
"verbose_name": "Uncompleted file abandoned max age",

View File

@ -120,6 +120,7 @@ STORAGES = {
# Storage
MEDIA_ROOT = Path("/home/kitty/src/lockbox/FILES")
MEDIA_URL = "files/"
INCOMPLETE_EXT = ".incomplete"
validate_paths(MEDIA_ROOT)

View File

@ -1,11 +1,10 @@
import os
from pathlib import Path
# TODO: Add logging to this module.
# TODO: Figure out file owner in system, permissions, GUID
# What's the default path if not provided? // docker volume
def validate_paths(media_path):
    """Ensure the media storage directory exists, creating it when missing.

    Args:
        media_path: Directory path (``str`` or ``pathlib.Path``) that must exist.

    Raises:
        FileExistsError: ``media_path`` exists but is not a directory.
        OSError: The directory could not be created (e.g. permissions).
    """
    # parents=True keeps the behaviour of the former os.makedirs call for
    # nested roots; exist_ok=True makes the call idempotent without a
    # separate (race-prone) isdir() pre-check.
    Path(media_path).mkdir(parents=True, exist_ok=True)

View File

@ -1,4 +1,4 @@
# Generated by Django 4.2.15 on 2024-09-16 11:24
# Generated by Django 4.2.15 on 2024-09-17 06:52
import common.utils
from django.conf import settings
@ -23,15 +23,16 @@ class Migration(migrations.Migration):
('lid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, verbose_name='lockbox ID')),
('date_created', models.DateTimeField(blank=True, help_text='date at which this object was created', verbose_name='date created')),
('date_updated', models.DateTimeField(blank=True, help_text='date at which this object was last updated', verbose_name='date updated')),
('filename', models.CharField(help_text='Name of the file', max_length=255, verbose_name='name')),
('extension', models.CharField(blank=True, help_text='reported filesystem extension (not mime type)', max_length=128, null=True, verbose_name='extension')),
('file', models.FileField(blank=True, help_text='actual file', null=True, upload_to='', verbose_name='file')),
('status', models.CharField(choices=[('uploading', 'uploading'), ('completed', 'completed'), ('processing', 'processing'), ('abandoned', 'abandoned')], default='uploading', help_text='upload status for file', max_length=10, verbose_name='status')),
('date_completed', models.DateTimeField(blank=True, help_text="datetime at which this file's upload was completed", null=True, verbose_name='completed on')),
('mime_type', models.CharField(blank=True, help_text='reported mime-type', max_length=128, null=True, verbose_name='mime-type')),
('file', models.FileField(blank=True, help_text='actual file', null=True, upload_to=storage.models.upload_to_fielpath, verbose_name='file')),
('status', models.CharField(choices=[('uploading', 'uploading'), ('completed', 'completed'), ('abandoned', 'abandoned'), ('error', 'error')], default='uploading', help_text='upload status for file', max_length=10, verbose_name='status')),
('datetime_completed', models.DateTimeField(blank=True, help_text="datetime at which this file's upload was completed", null=True, verbose_name='completed on')),
('expires', models.BooleanField(default=False, help_text="will be scrubbed on 'date_expires'", verbose_name='expires')),
('delete_on_expiration', models.BooleanField(default=False, help_text='will be deleted if expired and expires is true', verbose_name='delete on expiration')),
('size_on_disk', models.PositiveBigIntegerField(blank=True, help_text='total size on disk for this file', null=True, verbose_name='size on disk (bytes)')),
('size', models.PositiveBigIntegerField(blank=True, help_text='total size on disk for this file', null=True, verbose_name='size (bytes)')),
('expected_size', models.PositiveBigIntegerField(blank=True, help_text='expected file size', null=True, verbose_name='expected size (bytes)')),
('max_size_chunk_bytes', models.PositiveBigIntegerField(default=common.utils.get_max_size_chunk_bytes, help_text='max size of each individual chunk for this file', verbose_name='maximum size of chunks (bytes)')),
('last_end_bytes', models.BigIntegerField(blank=True, help_text='last uploaded bytes position', null=True, verbose_name='last end bytes')),
('owner', models.ForeignKey(blank=True, help_text='Who owns this file', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='files_owned', to=settings.AUTH_USER_MODEL, verbose_name='owner')),
],
options={
@ -39,23 +40,4 @@ class Migration(migrations.Migration):
'verbose_name_plural': 'files',
},
),
migrations.CreateModel(
name='FileChunk',
fields=[
('lid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, verbose_name='lockbox ID')),
('date_created', models.DateTimeField(blank=True, help_text='date at which this object was created', verbose_name='date created')),
('date_updated', models.DateTimeField(blank=True, help_text='date at which this object was last updated', verbose_name='date updated')),
('chunk', models.FileField(help_text='chunk file', upload_to=storage.models.get_upload_path_chunk, verbose_name='chunk file')),
('chunk_id', models.BigIntegerField(help_text='chunk id', verbose_name='chunk id')),
('size', models.BigIntegerField(help_text='chunk size', verbose_name='size')),
('start_bytes', models.BigIntegerField(help_text='part of file start', verbose_name='start bytes')),
('end_bytes', models.BigIntegerField(help_text='part of file end', verbose_name='end bytes')),
('file', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='chunks', to='storage.file')),
],
options={
'verbose_name': 'file chunk',
'verbose_name_plural': 'file chunks',
'unique_together': {('file', 'chunk_id')},
},
),
]

View File

@ -1,47 +1,33 @@
from datetime import timedelta
from pathlib import Path
from common.constants import UPLOAD_STATUS_TYPES
from common.constants import UPLOAD_STATUS_TYPES, UPLOAD_ERROR_CODES
from common.models import LockboxBase
from common.utils import get_config, get_max_size_chunk_bytes
from django.conf import settings
from django.core.files.uploadedfile import UploadedFile
from django.db import models, transaction
from django.utils import timezone
from django.utils.translation import gettext_lazy as _
from django.conf import settings
from hashlib import md5
def get_upload_path_chunk(instance, filename):
# TODO: How do we reconcile storage?
# TODO: Do we autodetect existing files task?
# TODO: Figure out absolute storage :(, custom storage and custom filefield? why is this not a def behaviour?
class UploadError(Exception):
filename = f"{instance.chunk_id}.chunk"
chunk_dir = settings.MEDIA_ROOT / str(instance.file.lid)
def __init__(self, *args, **kwargs):
self.code = kwargs.pop("code")
super().__init__(*args, **kwargs)
if not Path.exists(chunk_dir):
Path.mkdir(chunk_dir)
target_path = Path(chunk_dir) / Path(filename)
print(target_path)
return target_path
def upload_to_fielpath(instance, filename):
    """Build the relative upload path for a file that is still being uploaded.

    The path is ``<instance.lid>/<filename><INCOMPLETE_EXT>``, so every file
    lives in a directory named after its own id and carries the incomplete
    marker as a suffix.
    """
    incomplete_name = f"{filename}{settings.INCOMPLETE_EXT}"
    owner_dir = Path(str(instance.lid))
    return owner_dir / incomplete_name
class File(LockboxBase):
filename = models.CharField(
max_length=255,
null=False,
blank=False,
verbose_name = _("name"),
help_text=_("Name of the file"),
)
extension = models.CharField(
mime_type = models.CharField(
max_length=128,
blank=True,
null=True,
verbose_name=_("extension"),
help_text=_("reported filesystem extension (not mime type)"),
verbose_name=_("mime-type"),
help_text=_("reported mime-type"),
)
file = models.FileField(
@ -49,14 +35,14 @@ class File(LockboxBase):
blank=True,
verbose_name=_("file"),
help_text=_("actual file"),
upload_to=upload_to_fielpath
)
# TODO: Make this an FSM
UPLOAD_CHOICES = (
(UPLOAD_STATUS_TYPES.UPLOADING, _(UPLOAD_STATUS_TYPES.UPLOADING)),
(UPLOAD_STATUS_TYPES.COMPLETED, _(UPLOAD_STATUS_TYPES.COMPLETED)),
(UPLOAD_STATUS_TYPES.PROCESSING, _(UPLOAD_STATUS_TYPES.PROCESSING)),
(UPLOAD_STATUS_TYPES.ABANDONED, _(UPLOAD_STATUS_TYPES.ABANDONED)),
(UPLOAD_STATUS_TYPES.ERROR, _(UPLOAD_STATUS_TYPES.ERROR)),
)
status = models.CharField(
@ -69,7 +55,7 @@ class File(LockboxBase):
help_text=_("upload status for file"),
)
date_completed = models.DateTimeField(
datetime_completed = models.DateTimeField(
null=True,
blank=True,
verbose_name=_("completed on"),
@ -102,13 +88,20 @@ class File(LockboxBase):
help_text=_("will be deleted if expired and expires is true"),
)
size_on_disk = models.PositiveBigIntegerField(
size = models.PositiveBigIntegerField(
null=True,
blank=True,
verbose_name=_("size on disk (bytes)"),
verbose_name=_("size (bytes)"),
help_text=_("total size on disk for this file"),
)
expected_size = models.PositiveBigIntegerField(
null=True,
blank=True,
verbose_name=_("expected size (bytes)"),
help_text=_("expected file size"),
)
max_size_chunk_bytes = models.PositiveBigIntegerField(
null=False,
blank=False,
@ -117,157 +110,122 @@ class File(LockboxBase):
help_text=_("max size of each individual chunk for this file"),
)
last_end_bytes = models.BigIntegerField(
null=True,
blank=True,
verbose_name=("last end bytes"),
help_text=_("last uploaded bytes position"),
)
readonly_fields = [
"extension",
"mime_type",
"status",
"date_completed",
"size_on_disk",
"datetime_completed",
"size",
"file",
"max_size_chunk_bytes",
"last_end_bytes",
*LockboxBase.readonly_fields,
]
def __str__(self):
    """Return ``"<stored file name> (<lid>)"``, or ``"NO NAME (<lid>)"`` without a file."""
    # The earlier ``return self.filename`` short-circuited this method and
    # made the lid-bearing representation below unreachable.
    name = "NO NAME"
    if self.file:
        name = self.file.name
    return f"{name} ({self.lid})"
class Meta:
verbose_name = _("file")
verbose_name_plural = _("files")
@property
def checksum(self):
return 0
@property
def date_expires(self):
return self.date_created + timedelta(minutes=get_config("EXPIRATION_DELTA_MINUTES"))
def md5(self):
    """Return the hex MD5 digest of the stored file.

    Returns:
        str | None: Hex digest of the file contents, or ``None`` when
        ``self.exists`` is falsy.
    """
    import hashlib

    if not self.exists:
        return None

    digest = hashlib.md5()
    # Open by filesystem path (the FieldFile object itself is not accepted by
    # open()), close the handle deterministically, and hash in fixed-size
    # blocks so large uploads are never loaded into memory at once.
    with open(self.file.path, "rb") as stream:
        for block in iter(lambda: stream.read(1024 * 1024), b""):
            digest.update(block)
    return digest.hexdigest()
@property
def abandoned(self):
    """Whether this file has outlived the abandonment window.

    Returns:
        bool: ``True`` once ``date_created`` plus the configured
        ``ABANDONED_DELTA_MINUTES`` is at or before the current time.
    """
    # The delta belongs on the creation side: comparing date_created against
    # "now + delta" is true for every existing row.
    abandoned_at = self.date_created + timedelta(minutes=get_config("ABANDONED_DELTA_MINUTES"))
    return abandoned_at <= timezone.now()
@classmethod
def abandoned_condition(cls):
    """Build a query filter matching files past the abandonment window.

    Returns:
        models.Q: Matches rows whose ``date_created`` is at least
        ``ABANDONED_DELTA_MINUTES`` in the past.
    """
    # A classmethod must accept ``cls``; without it every call raised
    # TypeError. The cutoff is "now - delta" so only old rows match.
    cutoff = timezone.now() - timedelta(minutes=get_config("ABANDONED_DELTA_MINUTES"))
    return models.Q(date_created__lte=cutoff)
@property
def expired(self):
    """Whether the expiry moment (``date_expires``) is now or already past."""
    deadline = self.date_expires
    return deadline <= timezone.now()
@classmethod
def expired_conditon(cls):
    """Build a query filter matching files whose expiry moment has passed.

    Returns:
        models.Q: Matches rows whose ``date_created`` is at least
        ``EXPIRATION_DELTA_MINUTES`` in the past, i.e. the same moment the
        ``date_expires`` property computes per instance.
    """
    # ``date_expires`` is a Python property, not a column, so it cannot be
    # used in a lookup; express the same condition on ``date_created``.
    # ``cls`` is required for a classmethod to be callable at all.
    cutoff = timezone.now() - timedelta(minutes=get_config("EXPIRATION_DELTA_MINUTES"))
    return models.Q(date_created__lte=cutoff)
@property
def last_chunk_id(self):
last_chunk_id = self.chunks.order_by("-chunk_id").values("chunk_id").first()
if last_chunk_id:
return last_chunk_id.get("chunk_id")
return - 1
def exists(self):
    """Report whether the stored file is actually present on disk.

    Returns ``False`` when no file is attached; otherwise checks the
    filesystem rather than trusting the database state.
    """
    stored = self.file
    if stored:
        return Path(stored.path).is_file()
    return False
def create_chunk(self, chunk_file, chunk_data):
chunk = FileChunk(
file=self,
chunk=chunk_file,
chunk_id=self.last_chunk_id,
**chunk_data
)
def append_chunk(self, chunk_file, chunk_data):
# Accept one uploaded chunk for this file and record how far the upload got.
#
# chunk_file: the uploaded chunk; chunk_data: dict read here for the keys
# "start_bytes" and "end_bytes".
# Raises UploadError with code FILE_MISSING or CHUNK_MISMATCH (see below).
#
# Override in case recently abandoned
# Will persist if it does not error out.
self.status = UPLOAD_STATUS_TYPES.UPLOADING
# NOTE(review): `chunk` is not defined anywhere in this method; these two
# lines look like leftovers of the removed create_chunk() — confirm and drop.
chunk.save()
return chunk
# Do not rely on DB file state, check for actual file.
if not self.exists:
# Uh-oh: this is a follow-up (n + 1) chunk but there is no file on disk.
if chunk_data["start_bytes"] != 0:
self.status = UPLOAD_STATUS_TYPES.ERROR
self.save()
raise UploadError("File for uploaded chunk no longer exists", code=UPLOAD_ERROR_CODES.FILE_MISSING)
# NOTE(review): a last_end_bytes of 0 is falsy, so the continuity check is
# skipped in that case — confirm whether `is not None` was intended.
if self.last_end_bytes and self.last_end_bytes + 1 != chunk_data["start_bytes"]:
# Client sent a chunk that does not start where the previous one ended.
raise UploadError("Mismatch in expected chunk", code=UPLOAD_ERROR_CODES.CHUNK_MISMATCH)
self.last_end_bytes = chunk_data["end_bytes"]
# NOTE(review): the check above treats end_bytes as inclusive (next start is
# end + 1), while this comparison equates it with the total size — confirm
# which convention the client uses.
if self.expected_size == self.last_end_bytes:
# File is one shot chunk.
if chunk_data["start_bytes"] == 0:
self.file = chunk_file
self.save()
self.finalize()
return
# This is an n + 1 chunk.
# NOTE(review): no bytes from chunk_file are written to the stored file in the
# lines below; only the model is saved — the append itself appears unimplemented.
print("Appending bytes yo")
chunk_file.seek(0)
self.save()
def finalize(self):
    """Mark the upload as completed and drop the incomplete marker from the file.

    Reloads the row, sets the status to COMPLETED with the completion
    timestamp, renames the file on disk so it no longer ends with
    ``settings.INCOMPLETE_EXT``, and saves the new name.

    Raises:
        OSError: The file could not be renamed.
        Exception: Whatever ``save()`` raises; the rename is undone first.
    """
    self.refresh_from_db()
    self.status = UPLOAD_STATUS_TYPES.COMPLETED
    self.datetime_completed = timezone.now()

    incomplete_name = self.file.name
    final_name = incomplete_name
    marker = settings.INCOMPLETE_EXT
    # Strip the marker only as a trailing suffix; str.replace() would also
    # remove it from the middle of a user-supplied file name.
    if marker and final_name.endswith(marker):
        final_name = final_name[: -len(marker)]

    incomplete_path = Path(self.file.path)
    final_path = settings.MEDIA_ROOT / final_name

    with transaction.atomic():
        incomplete_path.rename(final_path)
        self.file.name = final_name
        try:
            self.save()
        except Exception:
            # The transaction only rolls back the database; put the file
            # back so disk and DB keep pointing at the same name.
            final_path.rename(incomplete_path)
            self.file.name = incomplete_name
            raise
def save(self, *args, **kwargs):
    """Persist the file, filling in the default max chunk size when unset."""
    chunk_limit = self.max_size_chunk_bytes
    if chunk_limit:
        return super().save(*args, **kwargs)
    # No limit recorded yet: fall back to the configured default before saving.
    self.max_size_chunk_bytes = get_max_size_chunk_bytes()
    return super().save(*args, **kwargs)
def delete(self, *args, **kwargs):
if self.file:
storage, path = self.file.storage, self.file.path
if self.file:
# TODO: Figure out if file exists and try to delete it if error, report error.
storage.delete(path)
with transaction.atomic():
self.chunks.all().delete()
if self.file:
if Path(self.file.path).is_file():
self.file.storage.delete(self.file.path)
self.file.storage.delete(Path(self.file.path).parent)
result = super().delete(*args, **kwargs)
return result
def handler_bytes(self):
# TODO: This is a naive approach, we almost never want to do this.
self.file.close()
self.file.open(mode="rb")
return UploadedFile(file=self.file, name=self.filename, size=self.offset)
class FileChunk(LockboxBase):
file = models.ForeignKey(
"storage.File",
null=False,
blank=False,
on_delete=models.CASCADE,
related_name="chunks",
)
chunk = models.FileField(
upload_to=get_upload_path_chunk,
null=False,
blank=False,
verbose_name=_("chunk file"),
help_text=_("chunk file"),
)
chunk_id = models.BigIntegerField(
null=False,
blank=False,
verbose_name=_("chunk id"),
help_text=_("chunk id"),
)
size = models.BigIntegerField(
null=False,
blank=False,
verbose_name=("size"),
help_text=_("chunk size"),
)
start_bytes = models.BigIntegerField(
null=False,
blank=False,
verbose_name=("start bytes"),
help_text=_("part of file start"),
)
end_bytes = models.BigIntegerField(
null=False,
blank=False,
verbose_name=("end bytes"),
help_text=_("part of file end"),
)
readonly_fields = [
"file",
"chunk_id",
"start",
"end",
"size",
*LockboxBase.readonly_fields,
]
def __str__(self):
return f"{self.file.filename}.{self.chunk_id}.chunk"
class Meta:
verbose_name = _("file chunk")
verbose_name_plural = _("file chunks")
unique_together = ("file", "chunk_id")
def save(self, *args, **kwargs):
self.chunk_id = self.file.last_chunk_id + 1
return super().save(*args, **kwargs)
def delete(self, *args, **kwargs):
if self.file:
storage, path = self.file.storage, self.file.path
if self.file:
# TODO: Figure out if file exists and try to delete it if error, report error.
storage.delete(path)
return super().delete(*args, **kwargs)
# class FileShare(LockboxBase):
# file = models.ForeignKey(

View File

@ -1,6 +1,6 @@
from rest_framework import serializers
from storage.models import File, FileChunk
from storage.models import File
class FileSerializer(serializers.ModelSerializer):
@ -9,21 +9,3 @@ class FileSerializer(serializers.ModelSerializer):
model = File
fields = "__all__"
read_only_fields = File.readonly_fields
class FileChunkSerializer(serializers.ModelSerializer):
class Meta:
model = FileChunk
fields = "__all__"
read_only_fields = FileChunk.readonly_fields
def validate(self, data):
data = super().validate(data)
file = File.objects.get(lid=data["file"])
if data["size"] > file.max_size_chunk_bytes:
detail = f"'size' param is larger than max chunk size for file:\
{data["size"]} > {file.max_size_chunk_bytes}"
raise serializers.ValidationError(detail)
return data

View File

@ -7,11 +7,7 @@ from storage import views_api, views_client
router = SimpleRouter()
router.register(r'files', views_api.FileModelViewSet)
chunk_router = NestedSimpleRouter(router, r'files', lookup="file")
chunk_router.register(r'chunks', views_api.FileChunkViewSet, basename="file-chunks")
urlpatterns = [
path("api/", include(router.urls)),
path("api/", include(chunk_router.urls)),
path("upload/", views_client.FileUploadView.as_view(), name="client-fileupload"),
]

View File

@ -8,12 +8,14 @@ from rest_framework import status
from rest_framework.decorators import action
from rest_framework.response import Response
from rest_framework.viewsets import ModelViewSet
from rest_framework.exceptions import NotFound, ValidationError
from rest_framework.exceptions import NotFound
from rest_framework.exceptions import ValidationError as UserValidationError
from rest_framework.parsers import FileUploadParser
# from user.models import LockboxUser
from storage.models import File, FileChunk
from storage.serializers import FileChunkSerializer, FileSerializer
from django.core.exceptions import ValidationError
from storage.models import File, UploadError
from storage.serializers import FileSerializer
class FileModelViewSet(ModelViewSet):
@ -21,48 +23,38 @@ class FileModelViewSet(ModelViewSet):
queryset = File.objects.all()
serializer_class = FileSerializer
@action(detail=True, methods=["GET"])
def last_chunk_position(self, request, pk=None):
file = self.get_object()
last_chunk_id = file.last_chunk_id
last_postion = 0
if last_chunk_id != -1:
last_chunk = self.chunks.order_by("-chunk_id").values("end_bytes").first()
if last_chunk:
last_postion = last_chunk_id.get("end_bytes")
return Response({"last_chunk_position": last_postion}, status=status.HTTP_200_OK)
@action(detail=True, methods=["PUT"])
def append_chunk(self, request, filename="DUMMY", format=None, pk=None):
try:
file = File.objects.filter(lid=pk).first()
except ValidationError:
raise UserValidationError(f"UUID {pk} is not a valid UUID")
class FileChunkViewSet(ModelViewSet):
model = FileChunk
queryset = FileChunk.objects.all()
serializer_class = FileChunkSerializer
parser_classes = (FileUploadParser,)
def create(self, request, filename="DUMMY", format=None, file_pk=None):
file = File.objects.filter(lid=str(file_pk)).first()
if not file:
raise NotFound(f"File with ID {file_pk} was not found")
raise NotFound(f"File with ID {pk} was not found")
chunk_data = self.get_content_range(request)
if not chunk_data:
raise ValidationError(
raise UserValidationError(
f"Missing content range headers"
)
chunk_file = request.FILES["file"]
chunk_file = request.FILES["Content"]
if chunk_file.size > file.max_size_chunk_bytes:
raise ValidationError(
raise UserValidationError(
f"Chunk size is greater than files max chunk size: {chunk_file.size} > {file.max_size_chunk_bytes}")
range_size = chunk_data["end_bytes"] - chunk_data["start_bytes"]
if chunk_file.size != range_size:
raise ValidationError(
raise UserValidationError(
f"Actual chunk size mismatches content-range header: {chunk_file.size} != {range_size}"
)
chunk_data["size"] = chunk_file.size
file.create_chunk(chunk_file=chunk_file, chunk_data=chunk_data)
try:
file.append_chunk(chunk_file, chunk_data)
except UploadError as e:
return Response({"code": e.code}, status=status.HTTP_400_BAD_REQUEST)
return Response(status=status.HTTP_201_CREATED)
def get_content_range(self, request):