Skip to content

Improve mark_ends #1034

@pochmann3

Description

@pochmann3

The current implementation:

def mark_ends(iterable):
    it = iter(iterable)

    try:
        b = next(it)
    except StopIteration:
        return

    try:
        for i in count():
            a = b
            b = next(it)
            yield i == 0, False, a

    except StopIteration:
        yield i == 0, True, a

It's a bit odd and slow. Instead of looping over count() and picking from the input iterator with next() in a try, I'd loop over the input iterator. And instead of producing ints only to compare them to zero, I'd use a bool flag:

def mark_ends__improved(iterable):
    it = iter(iterable)

    try:
        a = next(it)
        first = True
    except StopIteration:
        return

    for b in it:
        yield first, False, a
        a = b
        first = False

    yield first, True, a

Or with "my style" of preferring for statements:

def mark_ends__improved_mystyle(iterable):
    it = iter(iterable)
    for a in it:
        first = True
        for b in it:
            yield first, False, a
            a = b
            first = False
        yield first, True, a

These are almost the fastest of all my solutions. I also have some shorter ones (including a two-liner). All solutions and the benchmark code are at the end.

Image
Script with all solutions and benchmark
def mark_ends__shortest(iterable):
    for i, (a, b) in enumerate(pairwise(chain(iterable, [_marker]))):
        yield not i, b is _marker, a


def mark_ends__flag_pairs(iterable):
    first = True
    for a, b in pairwise(chain(iterable, [_marker])):
        yield first, b is _marker, a
        first = False


def mark_ends__improved(iterable):
    it = iter(iterable)

    try:
        a = next(it)
        first = True
    except StopIteration:
        return

    for b in it:
        yield first, False, a
        a = b
        first = False

    yield first, True, a


def mark_ends__impr_mystyle(iterable):
    it = iter(iterable)
    for a in it:
        first = True
        for b in it:
            yield first, False, a
            a = b
            first = False
        yield first, True, a


# Add an extra step to handle the first item, then
# we don't need a first flag and do as little as
# possible other than fast iterating and yielding.
def mark_ends__fastest_python(iterable):
    it = iter(iterable)
    for a in it:
        for b in it:
            yield (True, False, a)
            for c in it:
                yield (False, False, b)
                b = c
            yield (False, True, b)
            return
        yield (True, True, a)
        return


# Same as above, but with word variables.
def mark_ends__fastest_python_words(iterable):
    iterator = iter(iterable)
    for first in iterator:
        for second in iterator:
            yield (True, False, first)
            hold = second
            for another in iterator:
                yield (False, False, hold)
                hold = another
            yield (False, True, hold)
            return
        yield (True, True, first)
        return


# Single loop, use index for all decisions.
def mark_ends__enumerate(iterable):
    i = -1
    for i, b in enumerate(iterable):
        if i:
            yield i == 1, False, a
        a = b
    if i >= 0:
        yield i == 0, True, a


# Just append the marker to know we're at the end.
def mark_ends__marker_delay(iterable):
    it = chain(iterable, [_marker])
    for a in it:
        first = True
        for b in it:
            yield first, b is _marker, a
            a = b
            first = False


# One loop, use flag to distinguish the three states:
# - None: no previous item a
# - True: previous item a was first
# - False: previous item a was not first
def mark_ends__tristate(iterable):
    first = None
    for b in iterable:
        if first is None:
            first = True
        else:
            yield first, False, a
            first = False
        a = b
    if first is not None:
        yield first, True, a


def mark_ends__tristate_2(iterable):
    first = None
    for b in iterable:
        if first is not None:
            yield first, False, a
        a = b
        first = first is None
    if first is not None:
        yield first, True, a


# Almost only tools. One small generator.
def mark_ends__toolsish(iterable):
    items, short = tee(iterable)
    next(short, None)
    return zip(
        chain([True], repeat(False)),
        chain((False for _ in short), [True]),
        items
    )


# Optimized version of the above.
def mark_ends__fastest_tools(iterable):
    items, short = tee(iterable)
    for _ in short:
        return zip_longest(
            (True,),
            chain(compress(repeat(False), zip(short)), (True,)),
            items,
            fillvalue=False
        )
    return iter(())


# Replicating the "fastest_python" solution with tools.
def mark_ends__tools_replica(iterable):
    it = iter(iterable)
    for a in it:
        b, c, b2 = tee(it, 3)
        for _ in c:
            return chain(
                ((True, False, a),),
                zip_longest((), (), compress(b, zip(c, b2)), fillvalue=False),
                zip((False,), (True,), b2)
            )
        return iter(((True, True, a),))
    return iter(())


# Same as above, but with word variables.
def mark_ends__tools_replica_words(iterable):
    it = iter(iterable)
    for first in it:
        middle, ahead, last = tee(it, 3)
        for _ in ahead:
            middle = compress(middle, zip(ahead, last))
            return chain(
                ((True, False, first),),
                zip_longest((), (), middle, fillvalue=False),
                zip((False,), (True,), last)
            )
        return iter(((True, True, first),))
    return iter(())


# Like "flag_pairs", but instead of appending a marker and
# always checking, append an iterator that flips 'last'.
def mark_ends__end(iterable):
    first, last = True, False
    def end():
        nonlocal last
        last = True
        yield
    for a, _ in pairwise(chain(iterable, end())):
        yield first, last, a
        first = False


def mark_ends__current(iterable):
    it = iter(iterable)

    try:
        b = next(it)
    except StopIteration:
        return

    try:
        for i in count():
            a = b
            b = next(it)
            yield i == 0, False, a

    except StopIteration:
        yield i == 0, True, a


funcs = [
    mark_ends__current,
    mark_ends__improved,
    mark_ends__impr_mystyle,
    mark_ends__shortest,
    mark_ends__flag_pairs,
    mark_ends__fastest_python,
    mark_ends__enumerate,
    mark_ends__marker_delay,
    mark_ends__tristate,
    mark_ends__tristate_2,
    mark_ends__toolsish,
    mark_ends__fastest_tools,
    mark_ends__tools_replica,
    mark_ends__end,
]


from timeit import timeit
from statistics import mean, stdev
from itertools import *
import sys
import random

_marker = object()

# Correctness
for n in range(10):
    iterable = list(range(n))
    expect = list(mark_ends__current(iterable))
    for f in funcs:
        result = list(f(iterable))
        assert result == expect, (n, f)

# Speed
ns = sorted({
    n
    for i in range(41)
    for n in [round(10**(i/10))]
})
N = max(ns)
def shuffled(xs):
    return random.sample(xs, len(xs))

times = {f: {n: [] for n in ns} for f in funcs}
for i in range(5):
  #  print(i)
    for n in shuffled(ns):
        iterable = [None] * n
        reps = round((N/n)**.75)
        for f in shuffled(funcs):
            t = timeit(
                'for is_first, is_last, item in f(iterable): pass',
                'from __main__ import f, iterable; gc.enable()',
                number=reps
            ) / reps
            times[f][n].append(t)

# fastest time for each (function,n) pair, in nanoseconds
times = {f.__name__.removeprefix('mark_ends__'): [round(min(times[f][n]) * 1e9, 1) for n in ns]
    for f in funcs
}

# Times in percent of the current times
times = {
    name: [round(t / td * 100, 2) for t, td in zip(times[name], times['current'])]
    for name in times
}

# Plotting
import matplotlib.pyplot as plt
import matplotlib
colors = iter(matplotlib.cm.tab20(range(20)))
for name, y in times.items():
    plt.semilogx(ns, y, label=name, color=next(colors))
plt.ylim(0, 130)
#plt.ylim(bottom=0)
# plt.title('sliding_window([None]*10000, n)', weight='bold')
plt.title('mark_ends', weight='bold')
plt.xlabel('len(iterable)', weight='bold')
plt.ylabel('time in % of current solution time', weight='bold')
#plt.legend(loc='upper right')
plt.legend(loc='lower center', ncol=3)
plt.savefig('mark_ends.png', dpi=200)

print(sys.version)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions