Skip to content

Commit ecef20c

Browse files
authored
Merge pull request #276 from wahtari/copy_file_range
Added copy_file_range for faster file copy in mkcomposefs.
2 parents e78c7a4 + 3e38d73 commit ecef20c

File tree

1 file changed

+118
-14
lines changed

1 file changed

+118
-14
lines changed

tools/mkcomposefs.c

Lines changed: 118 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -782,9 +782,27 @@ static int write_to_fd(int fd, const char *content, ssize_t len)
782782

783783
return 0;
784784
}
785+
/*
 * Process-wide lock. It guards try_copy_file_range below and is also the
 * mutex handed to the work item iterator in execute_in_threads().
 */
static pthread_mutex_t mutex_thread_access = PTHREAD_MUTEX_INITIALIZER;

/* Cleared by disable_copy_file_range(); starts out enabled. */
static bool try_copy_file_range = true;

/*
 * Report whether copy_file_range() should still be attempted.
 *
 * The flag is sampled while holding mutex_thread_access so that worker
 * threads observe a consistent value.
 */
static bool is_copy_file_range_available(void)
{
	pthread_mutex_lock(&mutex_thread_access);
	const bool available = try_copy_file_range;
	pthread_mutex_unlock(&mutex_thread_access);

	return available;
}
796+
797+
static void disable_copy_file_range(void)
798+
{
799+
pthread_mutex_lock(&mutex_thread_access);
800+
try_copy_file_range = false;
801+
pthread_mutex_unlock(&mutex_thread_access);
802+
}
785803

786804
#define BUFSIZE 8192
787-
static int copy_file_data(int sfd, int dfd)
805+
static int copy_file_data_classic(int sfd, int dfd)
788806
{
789807
char buffer[BUFSIZE];
790808
ssize_t bytes_read;
@@ -807,6 +825,97 @@ static int copy_file_data(int sfd, int dfd)
807825
return 0;
808826
}
809827

828+
/*
 * Copy the contents of sfd to dfd using copy_file_range(2).
 *
 * The amount to copy is the st_size reported by fstat() on sfd. NULL offset
 * pointers are passed, so the descriptors' own file offsets are used and
 * advanced by the kernel.
 *
 * Returns 0 once st_size bytes have been copied (or when st_size is 0), and
 * -1 with errno set on failure. If the kernel reports that 0 bytes were
 * copied while bytes are still outstanding, errno is set to EINVAL so that
 * the caller can retry with the classic read/write copy.
 */
static int copy_file_data_range(int sfd, int dfd)
{
	struct stat st;

	if (fstat(sfd, &st) == -1)
		return -1;

	off_t remaining = st.st_size;

	while (remaining > 0) {
		ssize_t copied = copy_file_range(sfd, NULL, dfd, NULL, remaining, 0);

		if (copied < 0) {
			// Interrupted before anything was transferred: retry the call.
			// (A `continue` in a do/while would instead jump to the loop
			// condition and end the loop, reporting success with data
			// missing, hence the plain while loop.)
			if (errno == EINTR)
				continue;
			return -1;
		}

		if (copied == 0) {
			// Work around kernel bugs where copy_file_range fails to copy
			// any bytes and returns 0 instead of an error, e.g.
			// - reading virtual files from the proc filesystem which appear
			//   to have 0 size but are not empty. Noted in coreutils to
			//   affect kernels at least up to 5.6.19.
			// - copying from an overlay filesystem in docker. Reported to
			//   occur on Fedora 32.
			// https://github.com/rust-lang/rust/blob/0e5f5207881066973486e6a480fa46cfa22947e9/library/std/src/sys/pal/unix/kernel_copy.rs#L622
			// Report EINVAL so that the caller falls back to a classic copy.
			errno = EINVAL;
			return -1;
		}

		remaining -= copied;
	}

	return 0;
}
867+
868+
/*
 * Copy the contents of sfd to dfd.
 *
 * copy_file_range() is tried first unless it has been disabled for this run;
 * for a known set of errno values the copy is retried with the classic
 * read/write loop. Returns 0 on success, otherwise the failing helper's
 * return value (errno is left as set by that helper).
 *
 * Background on the fallback rules:
 * https://github.com/rust-lang/rust/blob/0e5f5207881066973486e6a480fa46cfa22947e9/library/std/src/sys/pal/unix/kernel_copy.rs#L622
 * https://gitlab.gnome.org/GNOME/libglnx/-/blob/202b294e6079e23242e65e0426f8639841d1210b/glnx-fdio.c#L846
 * https://github.com/systemd/systemd/blob/e71b40fd0026c0884ca26eb4f0a9fbe4d9285cfa/src/shared/copy.c#L338
 * https://lwn.net/Articles/846403/
 */
static int copy_file_data(int sfd, int dfd)
{
	if (is_copy_file_range_available()) {
		int rc = copy_file_data_range(sfd, dfd);

		// Anything that is not a failure is handed straight back.
		if (rc >= 0)
			return rc;

		// Fall back to the classic copy if either:
		// - Kernel version is < 4.5 (ENOSYS)
		// - Files are mounted on different fs (EXDEV)
		// - copy_file_range is broken in various ways on RHEL/CentOS 7 (EOPNOTSUPP)
		// - file is immutable or the syscall is blocked by seccomp (EPERM)
		// - copy_file_range cannot be used with pipes or device nodes (EINVAL)
		// - the writer fd was opened with O_APPEND (EBADF); truly invalid
		//   descriptors cause this too, but then the fallback is expected
		//   to hit the same error again
		switch (errno) {
		case ENOSYS:
		case EXDEV:
			// The remaining files of this run are expected to sit on a
			// similar file system, so stop trying for the entire run.
			disable_copy_file_range();
			break;
		case EOPNOTSUPP:
		case EPERM:
		case EINVAL:
		case EBADF:
			// Classic copy for this file only; copy_file_range could
			// still work for the next one.
			break;
		default:
			// Any other error is final.
			return rc;
		}
	}

	return copy_file_data_classic(sfd, dfd);
}
918+
810919
static int copy_file_with_dirs_if_needed(const char *src, const char *dst_base,
811920
const char *dst, bool try_enable_fsverity)
812921
{
@@ -1020,7 +1129,7 @@ static int construct_compute_data(struct lcfs_node_s *node,
10201129
}
10211130

10221131
struct work_item_iterator {
1023-
pthread_mutex_t mutex_node_iterator;
1132+
pthread_mutex_t *mutex_node_iterator;
10241133
int current_item;
10251134
int errorcode;
10261135
bool cancel_request;
@@ -1035,26 +1144,26 @@ static struct work_item *get_next_work_item(struct work_collection *collection,
10351144
bool cancel = false;
10361145
struct work_item *ret = NULL;
10371146

1038-
pthread_mutex_lock(&(iterator->mutex_node_iterator));
1147+
pthread_mutex_lock(iterator->mutex_node_iterator);
10391148
if (iterator->cancel_request)
10401149
cancel = true;
10411150
else if (iterator->current_item < collection->count) {
10421151
ret = &(collection->items[iterator->current_item]);
10431152
iterator->current_item++;
10441153
}
1045-
pthread_mutex_unlock(&(iterator->mutex_node_iterator));
1154+
pthread_mutex_unlock(iterator->mutex_node_iterator);
10461155
return cancel ? NULL : ret;
10471156
}
10481157

10491158
/*
 * Flag the shared iterator as cancelled and remember why.
 *
 * The update happens while holding the iterator's mutex. Only the first
 * caller's errorcode is stored; later calls leave iterator->errorcode
 * untouched. get_next_work_item() stops handing out items once
 * cancel_request is set.
 */
static void request_cancel(struct work_item_iterator *iterator, int errorcode)
10501159
{
1051-
pthread_mutex_lock(&(iterator->mutex_node_iterator));
1160+
pthread_mutex_lock(iterator->mutex_node_iterator);
10521161
// Record only the first cancel's error code
10531162
if (!iterator->cancel_request) {
10541163
iterator->cancel_request = true;
10551164
iterator->errorcode = errorcode;
10561165
}
1057-
pthread_mutex_unlock(&(iterator->mutex_node_iterator));
1166+
pthread_mutex_unlock(iterator->mutex_node_iterator);
10581167
}
10591168

10601169
typedef int (*THREAD_PROCESS_PROC)(struct work_item *, void *);
@@ -1109,12 +1218,7 @@ static int execute_in_threads(const int requested_threads,
11091218
THREAD_PROCESS_PROC proc, void *data)
11101219
{
11111220
struct work_item_iterator iterator;
1112-
int ret = pthread_mutex_init(&iterator.mutex_node_iterator, NULL);
1113-
if (0 != ret) {
1114-
errno = ret;
1115-
return -1;
1116-
}
1117-
1221+
iterator.mutex_node_iterator = &mutex_thread_access;
11181222
iterator.current_item = 0;
11191223
iterator.errorcode = 0;
11201224
iterator.cancel_request = false;
@@ -1125,6 +1229,7 @@ static int execute_in_threads(const int requested_threads,
11251229
thread_info.collection = collection;
11261230
thread_info.iterator = &iterator;
11271231

1232+
int ret = -1;
11281233
cleanup_free pthread_t *threads = NULL;
11291234
const int thread_count = requested_threads - 1;
11301235
if (thread_count >= 1) {
@@ -1201,7 +1306,6 @@ static int fill_store(const int thread_count, struct lcfs_node_s *node,
12011306
int ret = execute_in_threads(thread_count, &collection, process_copy,
12021307
(void *)digest_store_path);
12031308
cleanup_work_items(&collection);
1204-
12051309
return ret;
12061310
}
12071311

@@ -1239,7 +1343,7 @@ static void usage(const char *argv0)
12391343
" --from-file The source is a dump file, not a directory\n"
12401344
" --min-version=N Use this minimal format version (default=%d)\n"
12411345
" --max-version=N Use this maxium format version (default=%d)\n"
1242-
" --threads=N Use this to calculate digest and copy files in threads (default=%d)\n",
1346+
" --threads=N Use this to override the default number of threads used to calculate digest and copy files (default=%d)\n",
12431347
bin, LCFS_DEFAULT_VERSION_MIN, LCFS_DEFAULT_VERSION_MAX,
12441348
get_cpu_count());
12451349
}

0 commit comments

Comments
 (0)