Skip to content

Commit cfc22b4

Browse files
committed
Issue #15958: bytes.join and bytearray.join now accept arbitrary buffer objects.
1 parent 6ca07a2 commit cfc22b4

File tree

5 files changed

+167
-160
lines changed

5 files changed

+167
-160
lines changed

Lib/test/test_bytes.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -288,8 +288,22 @@ def test_join(self):
288288
self.assertEqual(self.type2test(b"").join(lst), b"abc")
289289
self.assertEqual(self.type2test(b"").join(tuple(lst)), b"abc")
290290
self.assertEqual(self.type2test(b"").join(iter(lst)), b"abc")
291-
self.assertEqual(self.type2test(b".").join([b"ab", b"cd"]), b"ab.cd")
292-
# XXX more...
291+
dot_join = self.type2test(b".:").join
292+
self.assertEqual(dot_join([b"ab", b"cd"]), b"ab.:cd")
293+
self.assertEqual(dot_join([memoryview(b"ab"), b"cd"]), b"ab.:cd")
294+
self.assertEqual(dot_join([b"ab", memoryview(b"cd")]), b"ab.:cd")
295+
self.assertEqual(dot_join([bytearray(b"ab"), b"cd"]), b"ab.:cd")
296+
self.assertEqual(dot_join([b"ab", bytearray(b"cd")]), b"ab.:cd")
297+
# Stress it with many items
298+
seq = [b"abc"] * 1000
299+
expected = b"abc" + b".:abc" * 999
300+
self.assertEqual(dot_join(seq), expected)
301+
# Error handling and cleanup when some item in the middle of the
302+
# sequence has the wrong type.
303+
with self.assertRaises(TypeError):
304+
dot_join([bytearray(b"ab"), "cd", b"ef"])
305+
with self.assertRaises(TypeError):
306+
dot_join([memoryview(b"ab"), "cd", b"ef"])
293307

294308
def test_count(self):
295309
b = self.type2test(b'mississippi')
@@ -1249,6 +1263,11 @@ def test_returns_new_copy(self):
12491263
self.assertEqual(val, newval)
12501264
self.assertTrue(val is not newval,
12511265
expr+' returned val on a mutable object')
1266+
sep = self.marshal(b'')
1267+
newval = sep.join([val])
1268+
self.assertEqual(val, newval)
1269+
self.assertIsNot(val, newval)
1270+
12521271

12531272
class FixedStringTest(test.string_tests.BaseTest):
12541273

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ What's New in Python 3.4.0 Alpha 1?
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #15958: bytes.join and bytearray.join now accept arbitrary buffer
14+
objects.
15+
1316
- Issue #14783: Improve int() docstring and switch docstrings for str(),
1417
range(), and slice() to use multi-line signatures.
1518

Objects/bytearrayobject.c

Lines changed: 5 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,7 @@ bytearray_dealloc(PyByteArrayObject *self)
10321032
#define FASTSEARCH fastsearch
10331033
#define STRINGLIB(F) stringlib_##F
10341034
#define STRINGLIB_CHAR char
1035+
#define STRINGLIB_SIZEOF_CHAR 1
10351036
#define STRINGLIB_LEN PyByteArray_GET_SIZE
10361037
#define STRINGLIB_STR PyByteArray_AS_STRING
10371038
#define STRINGLIB_NEW PyByteArray_FromStringAndSize
@@ -1043,6 +1044,7 @@ bytearray_dealloc(PyByteArrayObject *self)
10431044
#include "stringlib/fastsearch.h"
10441045
#include "stringlib/count.h"
10451046
#include "stringlib/find.h"
1047+
#include "stringlib/join.h"
10461048
#include "stringlib/partition.h"
10471049
#include "stringlib/split.h"
10481050
#include "stringlib/ctype.h"
@@ -2569,73 +2571,9 @@ Concatenate any number of bytes/bytearray objects, with B\n\
25692571
in between each pair, and return the result as a new bytearray.");
25702572

25712573
static PyObject *
2572-
bytearray_join(PyByteArrayObject *self, PyObject *it)
2573-
{
2574-
PyObject *seq;
2575-
Py_ssize_t mysize = Py_SIZE(self);
2576-
Py_ssize_t i;
2577-
Py_ssize_t n;
2578-
PyObject **items;
2579-
Py_ssize_t totalsize = 0;
2580-
PyObject *result;
2581-
char *dest;
2582-
2583-
seq = PySequence_Fast(it, "can only join an iterable");
2584-
if (seq == NULL)
2585-
return NULL;
2586-
n = PySequence_Fast_GET_SIZE(seq);
2587-
items = PySequence_Fast_ITEMS(seq);
2588-
2589-
/* Compute the total size, and check that they are all bytes */
2590-
/* XXX Shouldn't we use _getbuffer() on these items instead? */
2591-
for (i = 0; i < n; i++) {
2592-
PyObject *obj = items[i];
2593-
if (!PyByteArray_Check(obj) && !PyBytes_Check(obj)) {
2594-
PyErr_Format(PyExc_TypeError,
2595-
"can only join an iterable of bytes "
2596-
"(item %ld has type '%.100s')",
2597-
/* XXX %ld isn't right on Win64 */
2598-
(long)i, Py_TYPE(obj)->tp_name);
2599-
goto error;
2600-
}
2601-
if (i > 0)
2602-
totalsize += mysize;
2603-
totalsize += Py_SIZE(obj);
2604-
if (totalsize < 0) {
2605-
PyErr_NoMemory();
2606-
goto error;
2607-
}
2608-
}
2609-
2610-
/* Allocate the result, and copy the bytes */
2611-
result = PyByteArray_FromStringAndSize(NULL, totalsize);
2612-
if (result == NULL)
2613-
goto error;
2614-
dest = PyByteArray_AS_STRING(result);
2615-
for (i = 0; i < n; i++) {
2616-
PyObject *obj = items[i];
2617-
Py_ssize_t size = Py_SIZE(obj);
2618-
char *buf;
2619-
if (PyByteArray_Check(obj))
2620-
buf = PyByteArray_AS_STRING(obj);
2621-
else
2622-
buf = PyBytes_AS_STRING(obj);
2623-
if (i) {
2624-
memcpy(dest, self->ob_bytes, mysize);
2625-
dest += mysize;
2626-
}
2627-
memcpy(dest, buf, size);
2628-
dest += size;
2629-
}
2630-
2631-
/* Done */
2632-
Py_DECREF(seq);
2633-
return result;
2634-
2635-
/* Error handling */
2636-
error:
2637-
Py_DECREF(seq);
2638-
return NULL;
2574+
bytearray_join(PyObject *self, PyObject *iterable)
2575+
{
2576+
return stringlib_bytes_join(self, iterable);
26392577
}
26402578

26412579
PyDoc_STRVAR(splitlines__doc__,

Objects/bytesobject.c

Lines changed: 16 additions & 91 deletions
Original file line numberDiff line numberDiff line change
@@ -10,17 +10,26 @@
1010
static Py_ssize_t
1111
_getbuffer(PyObject *obj, Py_buffer *view)
1212
{
13-
PyBufferProcs *buffer = Py_TYPE(obj)->tp_as_buffer;
14-
15-
if (buffer == NULL || buffer->bf_getbuffer == NULL)
13+
PyBufferProcs *bufferprocs;
14+
if (PyBytes_CheckExact(obj)) {
15+
/* Fast path, e.g. for .join() of many bytes objects */
16+
Py_INCREF(obj);
17+
view->obj = obj;
18+
view->buf = PyBytes_AS_STRING(obj);
19+
view->len = PyBytes_GET_SIZE(obj);
20+
return view->len;
21+
}
22+
23+
bufferprocs = Py_TYPE(obj)->tp_as_buffer;
24+
if (bufferprocs == NULL || bufferprocs->bf_getbuffer == NULL)
1625
{
1726
PyErr_Format(PyExc_TypeError,
1827
"Type %.100s doesn't support the buffer API",
1928
Py_TYPE(obj)->tp_name);
2029
return -1;
2130
}
2231

23-
if (buffer->bf_getbuffer(obj, view, PyBUF_SIMPLE) < 0)
32+
if (bufferprocs->bf_getbuffer(obj, view, PyBUF_SIMPLE) < 0)
2433
return -1;
2534
return view->len;
2635
}
@@ -555,6 +564,7 @@ PyBytes_AsStringAndSize(register PyObject *obj,
555564
#include "stringlib/fastsearch.h"
556565
#include "stringlib/count.h"
557566
#include "stringlib/find.h"
567+
#include "stringlib/join.h"
558568
#include "stringlib/partition.h"
559569
#include "stringlib/split.h"
560570
#include "stringlib/ctype.h"
@@ -1107,94 +1117,9 @@ Concatenate any number of bytes objects, with B in between each pair.\n\
11071117
Example: b'.'.join([b'ab', b'pq', b'rs']) -> b'ab.pq.rs'.");
11081118

11091119
static PyObject *
1110-
bytes_join(PyObject *self, PyObject *orig)
1120+
bytes_join(PyObject *self, PyObject *iterable)
11111121
{
1112-
char *sep = PyBytes_AS_STRING(self);
1113-
const Py_ssize_t seplen = PyBytes_GET_SIZE(self);
1114-
PyObject *res = NULL;
1115-
char *p;
1116-
Py_ssize_t seqlen = 0;
1117-
size_t sz = 0;
1118-
Py_ssize_t i;
1119-
PyObject *seq, *item;
1120-
1121-
seq = PySequence_Fast(orig, "");
1122-
if (seq == NULL) {
1123-
return NULL;
1124-
}
1125-
1126-
seqlen = PySequence_Size(seq);
1127-
if (seqlen == 0) {
1128-
Py_DECREF(seq);
1129-
return PyBytes_FromString("");
1130-
}
1131-
if (seqlen == 1) {
1132-
item = PySequence_Fast_GET_ITEM(seq, 0);
1133-
if (PyBytes_CheckExact(item)) {
1134-
Py_INCREF(item);
1135-
Py_DECREF(seq);
1136-
return item;
1137-
}
1138-
}
1139-
1140-
/* There are at least two things to join, or else we have a subclass
1141-
* of the builtin types in the sequence.
1142-
* Do a pre-pass to figure out the total amount of space we'll
1143-
* need (sz), and see whether all argument are bytes.
1144-
*/
1145-
/* XXX Shouldn't we use _getbuffer() on these items instead? */
1146-
for (i = 0; i < seqlen; i++) {
1147-
const size_t old_sz = sz;
1148-
item = PySequence_Fast_GET_ITEM(seq, i);
1149-
if (!PyBytes_Check(item) && !PyByteArray_Check(item)) {
1150-
PyErr_Format(PyExc_TypeError,
1151-
"sequence item %zd: expected bytes,"
1152-
" %.80s found",
1153-
i, Py_TYPE(item)->tp_name);
1154-
Py_DECREF(seq);
1155-
return NULL;
1156-
}
1157-
sz += Py_SIZE(item);
1158-
if (i != 0)
1159-
sz += seplen;
1160-
if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
1161-
PyErr_SetString(PyExc_OverflowError,
1162-
"join() result is too long for bytes");
1163-
Py_DECREF(seq);
1164-
return NULL;
1165-
}
1166-
}
1167-
1168-
/* Allocate result space. */
1169-
res = PyBytes_FromStringAndSize((char*)NULL, sz);
1170-
if (res == NULL) {
1171-
Py_DECREF(seq);
1172-
return NULL;
1173-
}
1174-
1175-
/* Catenate everything. */
1176-
/* I'm not worried about a PyByteArray item growing because there's
1177-
nowhere in this function where we release the GIL. */
1178-
p = PyBytes_AS_STRING(res);
1179-
for (i = 0; i < seqlen; ++i) {
1180-
size_t n;
1181-
char *q;
1182-
if (i) {
1183-
Py_MEMCPY(p, sep, seplen);
1184-
p += seplen;
1185-
}
1186-
item = PySequence_Fast_GET_ITEM(seq, i);
1187-
n = Py_SIZE(item);
1188-
if (PyBytes_Check(item))
1189-
q = PyBytes_AS_STRING(item);
1190-
else
1191-
q = PyByteArray_AS_STRING(item);
1192-
Py_MEMCPY(p, q, n);
1193-
p += n;
1194-
}
1195-
1196-
Py_DECREF(seq);
1197-
return res;
1122+
return stringlib_bytes_join(self, iterable);
11981123
}
11991124

12001125
PyObject *

Objects/stringlib/join.h

Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/* stringlib: bytes joining implementation */
2+
3+
#if STRINGLIB_SIZEOF_CHAR != 1
4+
#error join.h only compatible with byte-wise strings
5+
#endif
6+
7+
/* Join the items of `iterable` with `sep` placed between consecutive items.
 *
 * `sep` is an object of the stringlib flavour being compiled (its data and
 * length are read through STRINGLIB_STR / STRINGLIB_LEN).  Each item of
 * `iterable` is accessed through _getbuffer(), so any object exporting a
 * simple buffer is accepted.
 *
 * Returns a new object created with STRINGLIB_NEW, or NULL with an exception
 * set:
 *   - PySequence_Fast() failure if `iterable` cannot be turned into a
 *     sequence,
 *   - TypeError if an item does not provide a buffer,
 *   - OverflowError if the total length would exceed PY_SSIZE_T_MAX,
 *   - RuntimeError if the sequence changes size while it is being scanned,
 *   - MemoryError if the temporary Py_buffer array cannot be allocated.
 *
 * When STRINGLIB_MUTABLE is not defined, a one-item sequence whose item is
 * exactly of the stringlib type is returned as-is (new reference) instead of
 * being copied.
 */
Py_LOCAL_INLINE(PyObject *)
STRINGLIB(bytes_join)(PyObject *sep, PyObject *iterable)
{
    char *sepstr = STRINGLIB_STR(sep);
    const Py_ssize_t seplen = STRINGLIB_LEN(sep);
    PyObject *res = NULL;
    char *p;
    Py_ssize_t seqlen = 0;
    Py_ssize_t sz = 0;
    Py_ssize_t i, nbufs;
    PyObject *seq, *item;
    Py_buffer *buffers = NULL;
    /* Small joins use an on-stack array of views and avoid a heap
     * allocation; the macro is #undef'ed after the function. */
#define NB_STATIC_BUFFERS 10
    Py_buffer static_buffers[NB_STATIC_BUFFERS];

    seq = PySequence_Fast(iterable, "can only join an iterable");
    if (seq == NULL) {
        return NULL;
    }

    seqlen = PySequence_Fast_GET_SIZE(seq);
    if (seqlen == 0) {
        Py_DECREF(seq);
        return STRINGLIB_NEW(NULL, 0);
    }
#ifndef STRINGLIB_MUTABLE
    /* Immutable flavour only: a lone exact-type item can be shared rather
     * than copied. */
    if (seqlen == 1) {
        item = PySequence_Fast_GET_ITEM(seq, 0);
        if (STRINGLIB_CHECK_EXACT(item)) {
            Py_INCREF(item);
            Py_DECREF(seq);
            return item;
        }
    }
#endif
    if (seqlen > NB_STATIC_BUFFERS) {
        buffers = PyMem_NEW(Py_buffer, seqlen);
        if (buffers == NULL) {
            Py_DECREF(seq);
            /* PyMem_NEW does not set an exception on failure; returning a
             * bare NULL here would surface as "NULL result without error
             * set" instead of MemoryError. */
            return PyErr_NoMemory();
        }
    }
    else {
        buffers = static_buffers;
    }

    /* Here is the general case.  Do a pre-pass to figure out the total
     * amount of space we'll need (sz), and see whether all arguments are
     * buffer-compatible.
     */
    for (i = 0, nbufs = 0; i < seqlen; i++) {
        Py_ssize_t itemlen;
        item = PySequence_Fast_GET_ITEM(seq, i);
        if (_getbuffer(item, &buffers[i]) < 0) {
            PyErr_Format(PyExc_TypeError,
                         "sequence item %zd: expected bytes, bytearray, "
                         "or an object with the buffer interface, %.80s found",
                         i, Py_TYPE(item)->tp_name);
            goto error;
        }
        /* Only views acquired so far must be released during cleanup. */
        nbufs = i + 1;
        itemlen = buffers[i].len;
        /* Check before adding so the signed sum never overflows. */
        if (itemlen > PY_SSIZE_T_MAX - sz) {
            PyErr_SetString(PyExc_OverflowError,
                            "join() result is too long");
            goto error;
        }
        sz += itemlen;
        if (i != 0) {
            if (seplen > PY_SSIZE_T_MAX - sz) {
                PyErr_SetString(PyExc_OverflowError,
                                "join() result is too long");
                goto error;
            }
            sz += seplen;
        }
        /* Re-check the length on every iteration: the views array was
         * sized for the original seqlen. */
        if (seqlen != PySequence_Fast_GET_SIZE(seq)) {
            PyErr_SetString(PyExc_RuntimeError,
                            "sequence changed size during iteration");
            goto error;
        }
    }

    /* Allocate result space. */
    res = STRINGLIB_NEW(NULL, sz);
    if (res == NULL)
        goto error;

    /* Catenate everything. */
    p = STRINGLIB_STR(res);
    for (i = 0; i < nbufs; i++) {
        Py_ssize_t n;
        char *q;
        if (i) {
            Py_MEMCPY(p, sepstr, seplen);
            p += seplen;
        }
        n = buffers[i].len;
        q = buffers[i].buf;
        Py_MEMCPY(p, q, n);
        p += n;
    }
    goto done;

error:
    res = NULL;
done:
    /* Shared cleanup for success and failure: drop the sequence, release
     * every acquired view, and free the heap array if one was used. */
    Py_DECREF(seq);
    for (i = 0; i < nbufs; i++)
        PyBuffer_Release(&buffers[i]);
    if (buffers != static_buffers)
        PyMem_FREE(buffers);
    return res;
}
121+
122+
#undef NB_STATIC_BUFFERS

0 commit comments

Comments
 (0)