Mike Gerwitz

Activist for User Freedom

aboutsummaryrefslogtreecommitdiffstats
blob: b4f22e1af86c282dda427378d3e8fd68a7d9f79c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
<?php


function gen_re_quick( $data )
{
    $re = ( '^' . gen_re( $data, 0 ) );

    // attempt to simplify the regex (we're not going to put a lot of effort into
    // this)
    return re_simplify( $re );
}


function gen_re( $data, $offset )
{
    // if we've reached the end of the zip length, or if there's no more zips to
    // look at, then stop
    if ( ( count( $data ) === 0 )
        || ( $offset === 5 )
    )
    {
        return '';
    }

    $out = '(';

    // loop through each digit at the current offset
    $last = '';
    foreach ( $data as $zip )
    {
        if ( !( isset( $zip[ $offset ] ) ) )
        {
            continue;
        }

        $digit = $zip[ $offset ];

        // if we've already seen this digit in the current position, then
        // continue
        if ( $digit === $last )
        {
            continue;
        }

        // we're going to recurse now, delimiting allowable digits with pipes
        // (for 'OR'); we'll recurse on a sublist that matches the zip up to
        // (and including) the current digit (to do this, note that we only need
        // to check the current digit, since our current list is already a
        // sublist of the parent list up to the current point)
        $prefix = substr( $zip, 0, $offset + 1 );

        $out .= ( $last === '' ) ? '' : '|';
        $out .= $digit . gen_re(
            filter_data( $data, $digit, $offset ),
            ( $offset + 1 )
        );

        $last = $digit;
    }

    return $out . ')';
}

function filter_data( $data, $chr, $offset )
{
    $ret = array();

    foreach ( $data as $val )
    {
        if ( $val[ $offset] === $chr )
        {
            $ret[] = $val;
        }
    }

    return $ret;
}

function re_simplify( $re )
{
    // the only simplification we currently do is joining sequential digit ORs
    // into a character range (e.g. (1|2|3|4) becomes [1-4])
    return preg_replace_callback( '/\([0-9](\|[0-9])*\)/', function( $results )
    {
        $match  = $results[ 0 ];
        $digits = explode( '|', str_replace( array( '(', ')' ), '', $match ) );

        // are the digits sequential (we will only perform this optimization if
        // there's more than 3 digits, since otherwise the replacement would
        // result in a string of equal or greater length)?
        if ( ( count( $digits ) > 3 ) && is_seq( $digits ) )
        {
            return sprintf( '[%d-%d]',
                $digits[ 0 ],
                $digits[ count( $digits ) - 1 ]
            );
        }
        elseif ( count( $digits ) === 1 )
        {
            // if there's only one digit, then that's all we need to return
            return $digits[ 0 ];
        }

        return '[' . implode( '', $digits ) . ']';
    }, $re );
}

function is_seq( $digits, $last = '' )
{
    // stop recursing once we're out of digits
    if ( count( $digits ) === 0 )
    {
        return true;
    }

    // grab the current digit and remove it from the list (this has the effect
    // of both cons and cdr)
    $digit = (int)( array_shift( $digits ) );

    // consider this a sequence if this digit is one more than the last (or if
    // there is no last digit) and if the following digit is sequential
    return ( ( $last === '' ) || ( $digit === ( $last + 1) ) )
        && is_seq( $digits, $digit );
}