Factorio analysis: data munging

.everyoneloves__top-leaderboard:empty,.everyoneloves__mid-leaderboard:empty,.everyoneloves__bot-mid-leaderboard:empty{ margin-bottom:0;
}

This project is... a little ridiculous. It's working, but it's a complete mess.

Data about Factorio's game economy are pulled from the wiki via the MediaWiki API, scrubbed, preprocessed, and thrown into SciPy for linear programming analysis using the MOSEK interior point method.

The pull script only depends on requests:

#!/usr/bin/env python3



import json, lzma, re

from os.path import getsize

from requests import Session

from sys import stdout



# Module-level HTTP session, created once at import time; get_mediawiki sends
# every API request through it.
session = Session()





def get_mediawiki(content=False, progress=None, **kwargs):
    """
    Run an `action=query` request against the MediaWiki API and lazily yield
    the page dicts it returns, following continuations until the end.

    https://stable.wiki.factorio.com is an instance of MediaWiki.
    The API endpoint is
    https://stable.wiki.factorio.com/api.php

    :param content: if true, also request the revision content and yield only
        pages whose 'revisions' member has actually been delivered.
    :param progress: optional callable(so_far, total); called after every
        response when content is requested and at least one page is known.
    :param kwargs: additional query parameters (titles=..., generator=...).
    :return: generator of page dicts.

    resp.raise_for_status() propagates HTTP error statuses as exceptions.
    """
    params = {'action': 'query',
              'format': 'json',
              **kwargs}
    if content:
        params.update({'prop': 'revisions',
                       'rvprop': 'content'})
    so_far = 0
    total = 0
    new_batch = True
    while True:
        resp = session.get('https://stable.wiki.factorio.com/api.php',
                           params=params)
        resp.raise_for_status()

        doc = resp.json()
        # A query that matches nothing has no 'query' member at all.
        pages = doc.get('query', {}).get('pages', {}).values()
        if content:
            full_pages = tuple(p for p in pages if 'revisions' in p)
            # Responses belonging to one batch are all counted once; only a
            # fresh batch adds to the total.
            if new_batch:
                total += len(pages)
            so_far += len(full_pages)
            if progress and total:
                progress(so_far, total)
            yield from full_pages
        else:
            yield from pages

        # 'batchcomplete' only says that the current batch of pages is fully
        # populated; a generator may still have more batches. The query is
        # over only once the server stops sending 'continue'.
        continuation = doc.get('continue')
        if not continuation:
            break
        new_batch = 'batchcomplete' in doc
        params.update(continuation)





def get_category(name, content=False, progress=None, **kwargs):
    """
    Yield the pages that are members of the wiki category `name`.

    Uses the `categorymembers` generator restricted to pages, 500 members per
    request; content, progress and any extra kwargs are handed through to
    get_mediawiki unchanged.
    """
    category_title = f'Category:{name}'
    return get_mediawiki(
        content=content,
        progress=progress,
        generator='categorymembers',
        gcmtitle=category_title,
        gcmtype='page',
        gcmlimit=500,
        **kwargs,
    )





def get_archived_titles():
    """Yield the pages (without content) of the 'Archived' category."""
    archived_pages = get_category('Archived')
    return archived_pages





def get_infoboxes(progress):
    """
    Yield every page of the 'Infobox_page' category together with its
    content, reporting through the progress callback.
    """
    infobox_pages = get_category('Infobox_page', progress=progress,
                                 content=True)
    return infobox_pages





def get_inter_tables(titles, progress):
    """
    Yield, with content, the pages named by `titles`.

    The titles are sent as one pipe-separated `titles` query parameter.
    """
    joined_titles = '|'.join(titles)
    return get_mediawiki(content=True, progress=progress,
                         titles=joined_titles)





# Separator between template parameters: a newline, optional whitespace and
# the pipe that starts the next parameter.
line_re = re.compile(r'\n\s*\|')

# One `name = value` template parameter. Whitespace around the name, the '='
# and the value is not captured.
var_re = re.compile(
    r'^\s*'
    r'(\S+)'      # parameter name
    r'\s*=\s*'
    r'(.+?)'      # value, as short as possible so trailing space is dropped
    r'\s*$')





def parse_infobox(page):
    """
    Flatten an infobox page returned by the API into a dict of parameters.

    Example content:

    {{Infobox
    |prototype-type = mining-drill
    |health = 150
    |dimensions=2×2
    |energy=300 {{Translation|kW}} burner
    |recipe = Time, 2 + Iron gear wheel, 3 + Iron plate, 3 + Stone furnace, 1
    |producers=Manual + Assembling machine 2 + Assembling machine 3
    }}<noinclude>
    [[Category:Infobox page]]
    </noinclude>

    Parameters are separated on newline-plus-pipe. Splitting on newline
    isn't a great idea, because
    https://www.mediawiki.org/wiki/Help:Templates#Named_parameters
    shows that only the pipe is mandatory as a separator. However, only
    splitting on pipe is worse, because there are pipes on the inside of
    links.

    Returns a dict holding 'pageid', 'title' (the page title with everything
    up to the first colon removed) and one entry per matched parameter.
    """
    wikitext = page['revisions'][0]['*']
    # Keep what lies between the first '{{' and the last '}}'.
    after_open = wikitext.split('{{', maxsplit=1)[1]
    template_body = after_open.rsplit('}}', maxsplit=1)[0]
    chunks = line_re.split(template_body)

    bare_title = page['title'].split(':', maxsplit=1)[1]
    info = {'pageid': page['pageid'], 'title': bare_title}
    for chunk in chunks:
        found = var_re.match(chunk)
        if found:
            key, value = found.groups()
            info[key] = value
    return info





# One template argument: leading whitespace skipped, the text up to the next
# pipe or brace captured.
part_tok = r'\s*([^|{}]*?)'
# The pipe between two template arguments.
border_tok = r'\s*\|'
# A template call such as {{Icon|Crude oil|100}} followed by an optional
# separator. Groups: type, first argument, second argument (None if absent),
# sep ('||', '+', '→' or '').
row_image_re = re.compile(
    r'\{\{\s*'
    r'(?P<type>\w+)'
    f'{border_tok}'
    f'{part_tok}'
    r'(?:'                # optional second argument
    f'{border_tok}'
    f'{part_tok}'
    r')?'
    r'(?:'                # optional third argument, ignored
    f'{border_tok}'
    r'[^{}]*'
    r')?'
    r'\}\}\s*'
    r'(?P<sep>'
    r'(?:'
    r'\|\||\+|→'          # cell border, sum, or input/output arrow
    r')?'
    r')',
)





def iter_cells(row):
    """
    Yield the cells of one table row; each cell is a list of the groups
    tuples of the templates found in it.

    e.g.
    | {{Icon|Solid fuel from light oil||}}
    || {{icon|Light oil|10}} + {{icon|time|3}}
    || {{icon|Solid fuel|1}}
    or
    | {{Imagelink|Oil refinery}}
    || {{Imagelink|Basic oil processing}}
    || {{Icon|Crude oil|100}} + {{icon|Time|5}}
    → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})

    A template followed by '||' closes its cell and is stored without the
    separator (3-tuple); every other template keeps its separator (4-tuple).
    """
    cell = []
    for m in row_image_re.finditer(row):
        if m.group('sep') == '||':
            # The cell border carries no information beyond ending the cell.
            cell.append(m.groups()[:-1])
            yield cell
            cell = []
        else:
            cell.append(m.groups())
    if cell:
        # Last cell of the row has no trailing '||'.
        yield cell





def parse_inter_table(page):
    """
    Extract recipes from the first wikitable of an intermediate-product page.

    Example:

    {| class="wikitable"
    ! Building !! Process !! Results
    |-
    | {{Imagelink|Oil refinery}} || {{Imagelink|Basic oil processing}} || {{Icon|Crude oil|100}} + {{icon|Time|5}} → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})
    |}

    or

    {| class="wikitable"
    ! Process !! Input !! Output
    |-
    | {{Icon|Solid fuel from heavy oil||}} || {{icon|Heavy oil|20}} + {{icon|time|3}} || {{icon|Solid fuel|1}}
    |-
    |}

    :return: (title, {}) when the page has no table or an empty heading is
        met, otherwise (title, {'recipes': rows}) where each row is a dict
        with 'inputs', 'outputs' and, if present, 'process' / 'building'.
    :raises ValueError: on an unknown heading or a non-icon template in a
        quantity cell; int() also raises it for a non-numeric quantity.
    """
    title = page['title']
    content = page['revisions'][0]['*']
    if '{|' not in content:
        return title, {}

    rows = []
    # Join the table onto one line, then keep what lies between '{|' and the
    # last '|}'.
    body = (content
            .replace('\n', '')
            .split('{|', maxsplit=1)[1]
            .rsplit('|}', maxsplit=1)[0])
    row_strings = body.split('|-')
    heads = tuple(h.strip().lower() for h in row_strings[0]
                  .split('!', maxsplit=1)[1]
                  .split('!!'))

    for line in row_strings[1:]:
        inputs = {}
        outputs = {}
        row = {'inputs': inputs, 'outputs': outputs}
        for head, parts in zip(heads, iter_cells(line)):
            if head in ('process', 'building'):
                row[head] = parts[0][1]
                continue
            if head == '':
                return title, {}  # Space science pack edge case
            if head not in ('input', 'output', 'results'):
                raise ValueError(f'Unrecognized head {head}')

            # A 'results' cell lists inputs first and switches to outputs
            # after the template that is followed by the arrow.
            side = outputs if head == 'output' else inputs
            for part in parts:
                res_type = part[0].lower()
                if res_type != 'icon':
                    raise ValueError(f'Unexpected resource type {res_type}')
                side[part[1]] = int(part[2])
                if head == 'results' and len(part) == 4 and part[-1] == '→':
                    side = outputs

        if inputs or outputs:
            rows.append(row)

    return title, {'recipes': rows}





def inter_needed(items):
    """
    Lazily yield the titles of the items that still need a recipe table:
    not archived, in the 'Intermediate products' category, and without a
    'cost' or 'recipe' entry.
    """
    def lacks_recipe(item):
        if item['archived']:
            return False
        if item.get('category') != 'Intermediate products':
            return False
        return 'cost' not in item and 'recipe' not in item

    return (item['title'] for item in items if lacks_recipe(item))





def save(fn, recipes):
    """Write `recipes` as 4-space-indented JSON to the xz-compressed file fn."""
    with lzma.open(fn, mode='wt') as xz_file:
        json.dump(recipes, xz_file, indent=4)





def main():
    """
    Download the item infoboxes, add recipe tables for intermediate products
    that have no recipe of their own, and save the result to items.json.xz.
    """
    def progress(so_far, total):
        # The carriage return redraws the counter on the same console line.
        ratio = so_far/total if total else 0
        print(f'{so_far}/{total} {ratio:.0%}', end='\r')
        stdout.flush()

    print('Getting archived items... ', end='')
    archived_titles = {p['title'] for p in get_archived_titles()}
    print(len(archived_titles))

    print('Getting item content...')
    items = tuple(parse_infobox(p) for p in get_infoboxes(progress))
    items_by_name = {i['title']: i for i in items}
    for item in items:
        item['archived'] = item['title'] in archived_titles

    print('\nFilling in intermediate products...')
    inter_tables = get_inter_tables(inter_needed(items), progress)
    used = 0
    for table_page in inter_tables:
        try:
            title, recipes = parse_inter_table(table_page)
            if recipes:
                used += 1
                items_by_name[title].update(recipes)
        except Exception as e:
            # Best effort: a page that cannot be parsed is reported and
            # skipped so that the remaining pages are still processed.
            print(f'\nWarning: {table_page["title"]} failed to parse - {e}')
    print(f'\n{used} intermediate tables used.')

    fn = 'items.json.xz'
    print(f'Saving to {fn}... ', end='')
    save(fn, items_by_name)
    print(f'{getsize(fn)//1024} kiB')





# Run the pull only when this file is executed as a script, not on import.
if __name__ == '__main__':

    main()

You need to run it before any of the next steps. After the data are pulled, run the preprocessing script:

#!/usr/bin/env python3



import json, lzma, re

import numpy as np

from collections import defaultdict

from os.path import getsize

from scipy.sparse import lil_matrix, save_npz

from sys import stdout

from typing import Dict, Iterable, Set, Sequence





# A magnitude, a space, and - somewhere later in the string - a k/M/G prefix
# directly followed by W or J, e.g. '300 {{Translation|kW}} burner'.
power_re = re.compile(r'([0-9.]+) .*([kMG])[WJ]')

# Multiplier for each SI prefix; neighbouring prefixes differ by 1000.
si_facs = {
    '': 1,
    'k': 1_000,
    'M': 1_000_000,
    'G': 1_000_000_000,
}





class Item:
    """
    One entry of the item data, with its fields exposed as attributes.

    Every key of the data dict becomes an attribute, with '-' replaced by
    '_'. The fields named in _KNOWN_FIELDS exist on every instance and are
    None when the data does not provide them.
    """

    # Attributes guaranteed to exist; each defaults to None.
    _KNOWN_FIELDS = (
        'archived',
        'cost',
        'cost_multiplier',
        'crafting_speed',
        'dimensions',
        'energy',
        'fluid_consumption',
        'fuel_value',
        'mining_hardness',
        'mining_power',
        'mining_speed',
        'mining_time',
        'pollution',
        'power_output',
        'producers',
        'prototype_type',
        'recipe',
        'recipes',
        'title',
        'valid_fuel',
    )

    def __init__(self, data: dict):
        """Store data, expose its entries as attributes and fill known gaps."""
        self.data = data
        for name in self._KNOWN_FIELDS:
            setattr(self, name, None)
        self.__dict__.update({k.replace('-', '_'): v
                              for k, v in data.items()})
        self.fill_gaps()

    def fill_gaps(self):
        """Override or supply fields for a few hard-coded special cases."""
        if self.prototype_type == 'technology':
            self.producers = 'Lab'
        elif self.title in ('Flamethrower turret', 'Gun turret',
                            'Laser turret'):
            self.producers = 'Assembling machine + manual'
        elif self.title == 'Space science pack':
            self.recipe = ('Time, 41.25 + Rocket part, 100 = '
                           'Space science pack, 1000')
        elif self.title == 'Steam':
            ex_rate = 10e6 * 60 / 5.82e6
            self.recipes = (
                {
                    'process': 'Steam165 (Boiler)',
                    'building': 'Boiler',
                    'inputs': {
                        'Water': 60,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam165': 60
                    }
                },
                {
                    'process': 'Steam500 (Heat exchanger)',
                    'building': 'Heat exchanger',
                    'inputs': {
                        'Water': ex_rate,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam500': ex_rate
                    }
                }
            )

    def __str__(self) -> str:
        return self.title

    @property
    def keep(self) -> bool:
        """
        Whether trim() should retain this item: it is not archived, is not
        'Rock' or 'Tree', and either has a truthy cost/recipe/recipes entry
        or a 'mining-hardness' key in its data, or is one of four
        hard-coded titles.
        """
        return (
            (not self.archived) and
            (self.title not in {'Rock', 'Tree'}) and
            (
                any(self.data.get(k) for k in ('cost', 'recipe', 'recipes'))
                or 'mining-hardness' in self.data
                or self.title in {'Crude oil',
                                  'Water',
                                  'Space science pack',
                                  'Steam'}
            )
        )

    def get_recipes(self) -> Iterable:
        """
        Yield the recipes that produce this item: one RecipeFactory per
        entry of self.recipes if there are any, otherwise a single factory
        built from the item itself.
        """
        if self.recipes:
            for rates in self.recipes:
                fac = RecipeFactory(self, rates=rates)
                yield from fac.make()
        else:
            fac = RecipeFactory(self)
            yield from fac.make()

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """
        Return (mining_power - mining_hardness) * mining_speed / mining_time
        using this item's own mining_power and mining_speed.
        """
        return (
                (float(self.mining_power) - mining_hardness)
                * float(self.mining_speed) / mining_time
        )





# Global registry of every loaded Item, keyed by lower-cased name. It stays
# None until load() fills it.
all_items: Dict[str, Item] = None





class ManualMiner:
    """
    Producer stand-in for mining by hand with a given tool. It offers the
    attributes and the mine_rate method that recipes read from a producer.
    """

    def __init__(self, tool: Item):
        self.tool = tool
        self.title = f'Manual with {tool}'
        # Hand mining is modelled without pollution and without footprint.
        self.pollution = 0
        self.dimensions = '0×0'

    def __str__(self) -> str:
        return self.title

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """Return 0.6 * (tool mining_power - mining_hardness) / mining_time."""
        effective_power = float(self.tool.mining_power) - mining_hardness
        return 0.6 * effective_power / mining_time





class Recipe:
    """
    A way of making `resource` with one producer. `rates` maps a name to a
    rate; the recipe keeps its own copy, which multiply_producer adjusts for
    the given producer.
    """

    def __init__(self, resource: str, producer: Item, rates: dict,
                 title: str = None):
        self.resource = resource
        # Without an explicit title, name the recipe after both parties.
        self.title = title if title else f'{resource} ({producer})'

        self.rates = dict(rates)
        self.producer = producer
        self.multiply_producer(producer)

    def __str__(self) -> str:
        return self.title

    def multiply_producer(self, prod: Item):
        """Scale every rate by the producer's crafting speed where it applies."""
        if prod.title in {'Boiler', 'Heat exchanger', 'Solar panel',
                          'Steam engine', 'Steam turbine'}:
            return  # no crafting rate modifier

        if prod.title == 'Nuclear reactor':
            self.rates['Heat'] = parse_power(prod.energy)
            return

        speed = float(prod.crafting_speed)
        for name, amount in self.rates.items():
            self.rates[name] = amount * speed





class MiningRecipe(Recipe):
    """Recipe whose output rate is computed by the producer's mine_rate."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 mining_hardness: float, mining_time: float, title: str = ''):
        # Both values must be set before the base constructor runs, because
        # it calls multiply_producer.
        self.mining_hardness = mining_hardness
        self.mining_time = mining_time
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, miner: Item):
        mined = self.producer.mine_rate(self.mining_hardness,
                                        self.mining_time)
        self.rates[self.resource] = mined
        if self.resource == 'Uranium ore':
            # Consumed at the same rate as the ore is produced.
            self.rates['Sulphuric acid'] = -mined





class TechRecipe(Recipe):
    """Recipe for a technology; its own rate is divided by cost_multiplier."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 cost_multiplier: float, title: str = ''):
        # Needed by multiply_producer, which the base constructor calls.
        self.cost_multiplier = cost_multiplier
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, lab: Item):
        unscaled = self.rates[self.resource]
        self.rates[self.resource] = unscaled / self.cost_multiplier





class FluidRecipe(Recipe):
    """Recipe for pumpjacks and offshore pumps, which have fixed rates."""

    def multiply_producer(self, producer: Item):
        yield_factor = 1.00  # Assumed
        fixed_rates = {
            'Pumpjack': 10*yield_factor,
            'Offshore pump': 1200,
        }
        if producer.title not in fixed_rates:
            raise NotImplementedError()
        self.rates[self.resource] = fixed_rates[producer.title]





class RecipeFactory:
    """
    Builds the Recipe objects for one resource.

    Given `rates` (a dict with 'process', 'inputs', 'outputs' and optionally
    'building'), the factory describes that single process. Without it, the
    rates come from the resource's recipe or cost string, or are left empty
    for mined and pumped resources.
    """

    def __init__(self, resource: Item, rates: dict = None):
        self.resource = resource
        self.producers = ()
        if rates:
            self.producers, self.title, self.rates = self.intermediate(rates)
        else:
            self.title = None
            needs_producers = False
            recipe = resource.recipe or resource.cost
            if recipe:
                self.rates = self.parse_recipe(recipe)
                if resource.prototype_type == 'technology':
                    self.producers = (all_items['lab'],)
                else:
                    needs_producers = True
            else:
                if (resource.mining_time
                        or resource.title in {'Crude oil', 'Water'}):
                    self.rates = {}
                    # Raw wood is handled by wood_mining, which builds its
                    # own producers.
                    if resource.title != 'Raw wood':
                        needs_producers = True
                else:
                    raise NotImplementedError()
            if needs_producers:
                self.producers = tuple(parse_producers(resource.producers))

    def __str__(self) -> str:
        # title is None for factories built without explicit rates.
        return self.title or f'{self.resource}'

    def intermediate(self, rates) -> (Iterable[Item], str, dict):
        """Return (producers, title, rates) for one explicit process dict."""
        building = rates.get('building')
        if building:
            producers = (all_items[building.lower()],)
        else:
            # Materialised so that the producers can be indexed and
            # iterated more than once.
            producers = tuple(parse_producers(self.resource.producers))
        title = rates['process']
        sane_rates = self.calc_recipe(rates['inputs'], rates['outputs'])
        return producers, title, sane_rates

    @staticmethod
    def parse_side(s: str) -> Dict[str, float]:
        """Parse 'Name, amount + Name, amount' into {name: amount}."""
        out = {}
        for pair in s.split('+'):
            k, v = pair.split(',')
            out[k.strip()] = float(v.strip())
        return out

    @staticmethod
    def calc_recipe(inputs: Dict[str, float],
                    outputs: Dict[str, float]) -> Dict[str, float]:
        """
        Combine inputs and outputs into rates per unit of time: outputs
        positive, inputs negative, all divided by the 'time'/'Time' input.

        `inputs` is left unmodified. Raises KeyError if it has no time entry.
        """
        rates = defaultdict(float, outputs)
        time_key = 'time' if 'time' in inputs else 'Time'
        t = inputs[time_key]
        for k in rates:
            rates[k] /= t
        for k, v in inputs.items():
            if k != time_key:
                rates[k] -= v / t
        return rates

    def parse_recipe(self, recipe: str) -> Dict[str, float]:
        """
        Parse 'inputs = outputs', or inputs only, in which case the output
        is one unit of the resource itself.
        """
        if '=' in recipe:
            inputs, outputs = recipe.split('=')
            outputs = self.parse_side(outputs)
        else:
            inputs = recipe
            outputs = {self.resource.title: 1}

        return self.calc_recipe(self.parse_side(inputs), outputs)

    def produce(self, cls, producer, **kwargs):
        """
        Instantiate cls for this resource and producer, and add the
        producer's 'Pollution' (if truthy) and 'Area' to the rates.
        """
        kwargs.setdefault('title', self.title)
        recipe = cls(self.resource.title, producer, self.rates, **kwargs)
        if producer.pollution:
            recipe.rates['Pollution'] = float(producer.pollution)

        dims = tuple(float(x) for x in producer.dimensions.split('×'))
        recipe.rates['Area'] = dims[0] * dims[1]

        return recipe

    def for_energy(self, cls, **kwargs) -> Iterable[Recipe]:
        """
        Yield one recipe per producer for electric and heat producers, and
        one per valid fuel for burner producers, each with its energy use.
        """
        for producer in self.producers:
            energy = -parse_power(producer.energy)

            if 'electric' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Energy'] = energy
                yield recipe

            elif 'heat' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Heat'] = energy
                yield recipe

            elif 'burner' in producer.energy:
                for fuel_name in producer.valid_fuel.split('+'):
                    fuel_name = fuel_name.strip().lower()
                    fuel = all_items[fuel_name]
                    fuel_value = parse_power(fuel.fuel_value)
                    new_kwargs = dict(kwargs)
                    if self.title:
                        title = self.title
                    else:
                        title = f'{self.resource} ({producer})'
                    new_kwargs['title'] = f'{title} fueled by {fuel_name}'

                    recipe = self.produce(cls, producer, **new_kwargs)
                    # Fuel is consumed at power / energy-per-fuel-item.
                    recipe.rates[fuel.title] = energy / fuel_value
                    yield recipe
            else:
                raise NotImplementedError()

    # A number, a space, then a template whose last argument is captured.
    tree_re = re.compile(r'(\d+) .*?\|([^}|]+)}')

    def wood_mining(self) -> Iterable[MiningRecipe]:
        """
        Yield a manual MiningRecipe for every (mining time, source) pair
        found in the resource's mining_time and every mining tool.
        """
        miners = tuple(
            ManualMiner(tool)
            for tool in all_items.values()
            if tool.prototype_type == 'mining-tool'
        )
        for m in self.tree_re.finditer(self.resource.mining_time):
            mining_time, source = int(m[1]), m[2]
            for miner in miners:
                yield self.produce(
                    MiningRecipe, miner,
                    mining_hardness=float(self.resource.mining_hardness),
                    mining_time=mining_time,
                    title=f'{self.resource} ({miner} from {source})')

    def make(self) -> Iterable[Recipe]:
        """Yield the recipes, choosing the recipe class by resource kind."""
        if self.rates:
            if self.resource.prototype_type == 'technology':
                yield self.produce(
                    TechRecipe, self.producers[0],
                    cost_multiplier=float(self.resource.cost_multiplier))
            elif self.resource.title == 'Energy':
                yield self.produce(Recipe, self.producers[0])
            else:
                yield from self.for_energy(Recipe)
        elif self.resource.title == 'Raw wood':
            yield from self.wood_mining()
        elif self.resource.mining_time:
            yield from self.for_energy(
                MiningRecipe,
                mining_hardness=float(self.resource.mining_hardness),
                mining_time=float(self.resource.mining_time))
        elif self.resource.title == 'Crude oil':
            yield from self.for_energy(FluidRecipe)
        elif self.resource.title == 'Water':
            yield self.produce(FluidRecipe, self.producers[0])
        else:
            raise NotImplementedError()





def parse_power(s: str) -> float:
    """
    Convert a power or energy string such as '300 {{Translation|kW}} burner'
    into a number in base units (W or J), using power_re and si_facs.
    """
    found = power_re.search(s)
    magnitude = float(found[1])
    scale = si_facs[found[2]]
    return magnitude * scale





def items_of_type(t: str) -> Iterable[Item]:
    """Lazily yield every item of all_items whose prototype_type equals t."""
    return filter(lambda item: item.prototype_type == t, all_items.values())





# Producer names of the form 'empty ... barrel'; parse_producers skips them.
barrel_re = re.compile(r'empty .+ barrel')


def parse_producers(s: str) -> Iterable[Item]:
    """
    Yield the Item for every producer named in a '+'-separated string.

    'furnace', 'assembling machine' and 'mining drill' expand to several
    items; 'manual' and names matching barrel_re yield nothing. Any other
    name is looked up in all_items (KeyError if unknown).
    """
    for raw_name in s.split('+'):
        name = raw_name.strip().lower()
        if name == 'manual' or barrel_re.match(name):
            continue

        if name == 'furnace':
            yield from items_of_type('furnace')
        elif name == 'assembling machine':
            for tier in range(1, 4):
                yield all_items[f'assembling machine {tier}']
        elif name == 'mining drill':
            for kind in ('burner', 'electric'):
                yield all_items[f'{kind} mining drill']
        else:
            yield all_items[name]





def trim(items: dict):
    """Delete, in place, every entry whose value has a falsy `keep`."""
    # Collect first: a dict must not change size while it is iterated.
    doomed = tuple(name for name, item in items.items() if not item.keep)
    print(f'Dropping {len(doomed)} items...')
    for name in doomed:
        del items[name]





def energy_data() -> dict:
    """
    Build the data dict of a synthetic 'Energy' item with four recipes:
    solar panel, steam engine, and steam turbine on Steam165 and Steam500.
    """
    def consumption(item):
        # The part of fluid_consumption before the first '/' is the number.
        return float(item.fluid_consumption.split('/')[0])

    def generator(building, process, power, steam=None, steam_rate=None):
        inputs = {'Time': 1}
        if steam is not None:
            inputs[steam] = steam_rate
        return {
            'building': building,
            'process': process,
            'inputs': inputs,
            'outputs': {'Energy': power},
        }

    solar_lines = all_items['solar panel'].power_output.split('<br/>')
    solar_ave = parse_power(next(
        line for line in solar_lines if 'average' in line))

    eng = all_items['steam engine']
    eng_rate = consumption(eng)
    eng_power = parse_power(eng.power_output)

    turbine_rate = consumption(all_items['steam turbine'])
    turbine_power_500 = 5.82e6  # ignore non-precise data and use this instead
    turbine_power_165 = 1.8e6   # from wiki page body

    return {
        'title': 'Energy',
        'recipes': (
            generator('Solar panel', 'Energy (Solar panel)', solar_ave),
            generator('Steam engine', 'Energy (Steam engine)', eng_power,
                      'Steam165', eng_rate),
            generator('Steam turbine', 'Energy (Steam turbine @ 165C)',
                      turbine_power_165, 'Steam165', turbine_rate),
            generator('Steam turbine', 'Energy (Steam turbine @ 500C)',
                      turbine_power_500, 'Steam500', turbine_rate),
        )
    }





def load(fn: str):
    """
    Fill the global all_items from the xz-compressed JSON file fn, keyed by
    lower-cased name, then add the synthetic 'energy' item.
    """
    global all_items
    with lzma.open(fn) as xz_file:
        raw_items = json.load(xz_file)
        all_items = {name.lower(): Item(data)
                     for name, data in raw_items.items()}
    # energy_data reads all_items, so this has to come last.
    all_items['energy'] = Item(energy_data())





def get_recipes() -> (Dict[str, Recipe], Set[str]):
    """
    Collect the recipes of every item in all_items.

    Returns (recipes by title, set of every name used in any recipe's rates).
    """
    recipes = {}
    resources = set()
    for item in all_items.values():
        for recipe in tuple(item.get_recipes()):
            recipes[recipe.title] = recipe
            resources.update(recipe.rates)

    return recipes, resources





def field_size(names: Iterable) -> int:
    """Return the length of the longest str() form among `names`."""
    return max(map(len, map(str, names)))





def write_csv_for_r(recipes: Sequence[Recipe], resources: Sequence[str],
                    fn: str):
    """
    Write the recipe/resource rate table as padded CSV into the
    xz-compressed file fn.

    Recipes going down, resources going right. The first line is the header
    of resource names; every following line starts with the recipe name and
    holds one signed scientific-notation rate per resource (0 where the
    recipe does not involve the resource).
    """
    rec_width = field_size(recipes)
    float_width = 15
    head_format = f'{{:{float_width+8}}}'
    rec_format = '\n{:' + str(rec_width+1) + '}'
    # One number format per resource column; the minimum field width is the
    # length of the resource name.
    cell_formats = [f'{{:+{len(res)}.{float_width}e}},' for res in resources]

    with lzma.open(fn, 'wt') as f:
        f.write(' '*(rec_width+1))
        for res in resources:
            f.write(head_format.format(f'{res},'))

        for rec in recipes:
            f.write(rec_format.format(f'{rec},'))
            for res, cell_format in zip(resources, cell_formats):
                x = rec.rates.get(res, 0)
                f.write(cell_format.format(x))





def write_for_numpy(recipes: Sequence[Recipe], resources: Sequence[str],
                    meta_fn: str, npz_fn: str):
    """
    Save the recipe data for numpy/scipy.

    meta_fn receives the arrays `recipe_names` and `resource_names`
    (compressed .npz); npz_fn receives a sparse CSR matrix with one row per
    resource and one column per recipe holding the rates.

    Raises ValueError if recipes or resources is empty, and KeyError if a
    recipe uses a resource that is not in `resources`.
    """
    rec_names = [r.title for r in recipes]
    w_rec = max(len(r) for r in rec_names)
    # asarray rather than array(copy=False): building an array from a list
    # always needs a copy, which NumPy 2 rejects when copy=False is given.
    recipe_names = np.asarray(rec_names, dtype=f'U{w_rec}')

    w_res = max(len(r) for r in resources)
    resource_names = np.asarray(resources, dtype=f'U{w_res}')

    np.savez_compressed(meta_fn, recipe_names=recipe_names,
                        resource_names=resource_names)

    # Row lookup built once; the first occurrence wins, as with list.index.
    row_of = {}
    for i, res in enumerate(resources):
        row_of.setdefault(res, i)

    rec_mat = lil_matrix((len(resources), len(recipes)))
    for j, rec in enumerate(recipes):
        for res, q in rec.rates.items():
            rec_mat[row_of[res], j] = q
    save_npz(npz_fn, rec_mat.tocsr())





def file_banner(fn):
    """Print the file name followed by its size in whole kiB."""
    size_kib = getsize(fn) // 1024
    print(f'{fn} {size_kib} kiB')





def main():
    """
    Load items.json.xz, drop the items that are not kept, compute all
    recipes, and write them out for numpy and as CSV.
    """
    items_fn = 'items.json.xz'
    print(f'Loading {items_fn}... ', end='')
    load(items_fn)
    print(f'{len(all_items)} items')

    trim(all_items)

    print('Calculating recipes... ', end='')
    recipe_map, resource_set = get_recipes()
    print(f'{len(recipe_map)} recipes, {len(resource_set)} resources')

    # Fixed orderings so that matrix rows and columns line up with the names.
    resources = sorted(resource_set)
    recipes = sorted(recipe_map.values(), key=lambda rec: rec.title)

    print('Saving files for numpy...')
    meta_fn, npz_fn = 'recipe-names.npz', 'recipes.npz'
    write_for_numpy(recipes, resources, meta_fn, npz_fn)
    for written in (meta_fn, npz_fn):
        file_banner(written)

    csv_fn = 'recipes.csv.xz'
    print('Saving recipes for use by R...')
    stdout.flush()
    write_csv_for_r(recipes, resources, csv_fn)
    file_banner(csv_fn)





# Run the preprocessing only when this file is executed as a script.
if __name__ == '__main__':

    main()

That's followed by an analysis script that I won't post here, to constrain the scope of this first review.

The main thing that needs work is the recipe factory code. It sprinkles logic about item types where it doesn't belong, and that really needs to be improved. I have some ideas about how to do that, but I'd like to hear from the community (on that, and any other wrinkles you find).

asked 18 mins ago

Reinderien

5,280926

add a comment |

This project is... a little ridiculous. It's working, but it's a complete mess.

Data about Factorio's game economy are pulled from the wiki via the MediaWiki API, scrubbed, preprocessed, and thrown into SciPy for linear programming analysis using the MOSEK interior point method.

The pull script only depends on requests:

#!/usr/bin/env python3



import json, lzma, re

from os.path import getsize

from requests import Session

from sys import stdout



# Module-level HTTP session, created once at import time; get_mediawiki sends
# every API request through it.
session = Session()





def get_mediawiki(content=False, progress=None, **kwargs):
    """
    Run an `action=query` request against the MediaWiki API and lazily yield
    the page dicts it returns, following continuations until the end.

    https://stable.wiki.factorio.com is an instance of MediaWiki.
    The API endpoint is
    https://stable.wiki.factorio.com/api.php

    :param content: if true, also request the revision content and yield only
        pages whose 'revisions' member has actually been delivered.
    :param progress: optional callable(so_far, total); called after every
        response when content is requested and at least one page is known.
    :param kwargs: additional query parameters (titles=..., generator=...).
    :return: generator of page dicts.

    resp.raise_for_status() propagates HTTP error statuses as exceptions.
    """
    params = {'action': 'query',
              'format': 'json',
              **kwargs}
    if content:
        params.update({'prop': 'revisions',
                       'rvprop': 'content'})
    so_far = 0
    total = 0
    new_batch = True
    while True:
        resp = session.get('https://stable.wiki.factorio.com/api.php',
                           params=params)
        resp.raise_for_status()

        doc = resp.json()
        # A query that matches nothing has no 'query' member at all.
        pages = doc.get('query', {}).get('pages', {}).values()
        if content:
            full_pages = tuple(p for p in pages if 'revisions' in p)
            # Responses belonging to one batch are all counted once; only a
            # fresh batch adds to the total.
            if new_batch:
                total += len(pages)
            so_far += len(full_pages)
            if progress and total:
                progress(so_far, total)
            yield from full_pages
        else:
            yield from pages

        # 'batchcomplete' only says that the current batch of pages is fully
        # populated; a generator may still have more batches. The query is
        # over only once the server stops sending 'continue'.
        continuation = doc.get('continue')
        if not continuation:
            break
        new_batch = 'batchcomplete' in doc
        params.update(continuation)





def get_category(name, content=False, progress=None, **kwargs):
    """
    Yield the pages that are members of the wiki category `name`.

    Uses the `categorymembers` generator restricted to pages, 500 members per
    request; content, progress and any extra kwargs are handed through to
    get_mediawiki unchanged.
    """
    category_title = f'Category:{name}'
    return get_mediawiki(
        content=content,
        progress=progress,
        generator='categorymembers',
        gcmtitle=category_title,
        gcmtype='page',
        gcmlimit=500,
        **kwargs,
    )





def get_archived_titles():
    """Yield the pages (without content) of the 'Archived' category."""
    archived_pages = get_category('Archived')
    return archived_pages





def get_infoboxes(progress):
    """
    Yield every page of the 'Infobox_page' category together with its
    content, reporting through the progress callback.
    """
    infobox_pages = get_category('Infobox_page', progress=progress,
                                 content=True)
    return infobox_pages





def get_inter_tables(titles, progress):
    """
    Yield, with content, the pages named by `titles`.

    The titles are sent as one pipe-separated `titles` query parameter.
    """
    joined_titles = '|'.join(titles)
    return get_mediawiki(content=True, progress=progress,
                         titles=joined_titles)





# Separator between template parameters: a newline, optional whitespace and
# the pipe that starts the next parameter.
line_re = re.compile(r'\n\s*\|')

# One `name = value` template parameter. Whitespace around the name, the '='
# and the value is not captured.
var_re = re.compile(
    r'^\s*'
    r'(\S+)'      # parameter name
    r'\s*=\s*'
    r'(.+?)'      # value, as short as possible so trailing space is dropped
    r'\s*$')





def parse_infobox(page):
    """
    Turn an infobox page into a flat dict of its template parameters.

    The returned dict holds 'pageid', 'title' (the page title with everything
    up to the first ':' removed) and one string entry per 'key = value'
    parameter found between the first '{{' and the last '}}'.

    Example page content:

    {{Infobox
    |prototype-type = mining-drill
    |internal-name = burner-mining-drill
    |category = Production
    |health = 150
    |stack-size=50
    |dimensions=2×2
    |energy=300 {{Translation|kW}} burner
    |mining-power=2.5
    |mining-speed=0.35
    |pollution=10
    |valid-fuel = Wood + Raw wood + Wooden chest + Coal + Solid fuel
    |recipe = Time, 2 + Iron gear wheel, 3 + Iron plate, 3 + Stone furnace, 1
    |producers=Manual + Assembling machine 2 + Assembling machine 3
    }}<noinclude>
    [[Category:Infobox page]]
    </noinclude>

    Parameters are split on newline-then-pipe. That is not strictly right,
    because https://www.mediawiki.org/wiki/Help:Templates#Named_parameters
    only requires the pipe; but splitting on the pipe alone is worse, since
    pipes also occur inside links and nested templates.
    """
    wikitext = page['revisions'][0]['*']
    after_open = wikitext.split('{{', maxsplit=1)[1]
    template_body = after_open.rsplit('}}', maxsplit=1)[0]
    chunks = line_re.split(template_body)

    short_title = page['title'].split(':', maxsplit=1)[1]
    info = {'pageid': page['pageid'],
            'title': short_title}
    for chunk in chunks:
        found = var_re.match(chunk)
        if found:
            key, value = found.groups()
            info[key] = value
    return info





# One template argument: optional whitespace, then text free of pipes/braces.
part_tok = r'\s*([^|{}]*?)'
# The pipe separating template arguments.
border_tok = r'\s*\|'
# Matches one {{Type|name|amount|...}} template plus the separator after it.
# Groups: type, first argument, optional second argument, separator.
row_image_re = re.compile(
    r'\{\{\s*'
    r'(?P<type>\w+)'
    f'{border_tok}'
    f'{part_tok}'
    r'(?:'
       f'{border_tok}'
       f'{part_tok}'
    r')?'
    r'(?:'                # any further arguments are skipped
       f'{border_tok}'
       r'[^{}]*'
    r')?'
    r'\}\}\s*'
    r'(?P<sep>'           # '||' (next cell), '+' or '→'; may be empty
      r'(?:'
        r'\|\||\+|→'
      r')?'
    r')',
)





def iter_cells(row):
    """
    Split one wikitable row into cells, each a list of template tuples.

    A template followed by '||' closes the current cell and is stored without
    its separator (3-tuple); every other template keeps its separator as the
    last element (4-tuple).

    e.g.
    | {{Icon|Solid fuel from light oil||}}
    || {{icon|Light oil|10}} + {{icon|time|3}}
    || {{icon|Solid fuel|1}}
    or
    | {{Imagelink|Oil refinery}}
    || {{Imagelink|Basic oil processing}}
    || {{Icon|Crude oil|100}} + {{icon|Time|5}}
    → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})
    """

    cell = []
    for m in row_image_re.finditer(row):
        if m.group('sep') == '||':
            cell.append(m.groups()[:-1])
            yield cell
            cell = []
        else:
            cell.append(m.groups())
    # The last cell of a row has no trailing '||'.
    if cell:
        yield cell





def parse_inter_table(page):
    """
    Extract recipes from the first wikitable of an intermediate-product page.

    Returns (title, {'recipes': [row, ...]}), where each row holds 'inputs'
    and 'outputs' dicts (name -> int amount) plus 'process' and/or
    'building' when the table has such columns. Returns (title, {}) when the
    page has no table or the table has an empty column heading.

    :raises ValueError: on an unknown column heading, a non-icon template in
        an input/output cell, or a non-integer amount.

    Example:

    {| class="wikitable"
    ! Building !! Process !! Results
    |-
    | {{Imagelink|Oil refinery}} || {{Imagelink|Basic oil processing}} || {{Icon|Crude oil|100}} + {{icon|Time|5}} → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})
    |-
    | {{Imagelink|Oil refinery}} || {{imagelink|Coal liquefaction}} || {{icon|Coal|10}} + {{Icon|Heavy oil|25}} + {{icon|Steam|50}} + {{icon|Time|5}} → {{Icon|Heavy oil|35}} + ({{Icon|Light oil|15}} + {{Icon|Petroleum gas|20}})
    |}

    or

    {| class="wikitable"
    ! Process !! Input !! Output
    |-
    | {{Icon|Solid fuel from heavy oil||}} || {{icon|Heavy oil|20}} + {{icon|time|3}} || {{icon|Solid fuel|1}}
    |-
    | {{Icon|Solid fuel from light oil||}} || {{icon|Light oil|10}} + {{icon|time|3}} || {{icon|Solid fuel|1}}
    |-
    |}
    """
    title = page['title']
    content = page['revisions'][0]['*']
    if '{|' not in content:
        return title, {}

    rows = []
    # Drop the newlines so that every table row sits on one logical line.
    body = (content
            .replace('\n', '')
            .split('{|', maxsplit=1)[1]
            .rsplit('|}', maxsplit=1)[0])
    row_strings = body.split('|-')
    heads = tuple(h.strip().lower() for h in row_strings[0]
                  .split('!', maxsplit=1)[1]
                  .split('!!'))

    for line in row_strings[1:]:
        inputs = {}
        outputs = {}
        row = {'inputs': inputs, 'outputs': outputs}
        for head, parts in zip(heads, iter_cells(line)):
            if head in ('process', 'building'):
                row[head] = parts[0][1]
                continue
            if head == '':
                return title, {}  # Space science pack edge case
            if head not in ('input', 'output', 'results'):
                raise ValueError(f'Unrecognized head {head}')

            # A 'results' cell starts with inputs and flips to outputs at '→'.
            side = outputs if head == 'output' else inputs
            for part in parts:
                res_type = part[0].lower()
                if res_type != 'icon':
                    raise ValueError(f'Unexpected resource type {res_type}')
                side[part[1]] = int(part[2])
                if head == 'results' and len(part) == 4 and part[-1] == '→':
                    side = outputs

        if inputs or outputs:
            rows.append(row)

    return title, {'recipes': rows}





def inter_needed(items):
    """
    Lazily yield the titles of items that are not archived, belong to the
    'Intermediate products' category, and have neither a 'cost' nor a
    'recipe' entry.
    """
    def wanted(item):
        if item['archived']:
            return False
        if item.get('category') != 'Intermediate products':
            return False
        return 'cost' not in item and 'recipe' not in item

    return (item['title'] for item in items if wanted(item))





def save(fn, recipes):
    """Write `recipes` to `fn` as xz-compressed JSON indented by 4 spaces."""
    with lzma.open(fn, 'wt') as out:
        json.dump(recipes, out, indent=4)





def main():
    """
    Pull item data from the wiki and save it to items.json.xz.

    Downloads the archived titles and all infobox pages, fills in recipes
    from the tables of intermediate products that have none, and writes the
    result keyed by item title.
    """
    def progress(so_far, total):
        # '\r' returns to the start of the line so the counter is overwritten.
        print(f'{so_far}/{total} {so_far/total:.0%}', end='\r')
        stdout.flush()

    print('Getting archived items... ', end='')
    archived_titles = {p['title'] for p in get_archived_titles()}
    print(len(archived_titles))

    print('Getting item content...')
    items = tuple(parse_infobox(p) for p in get_infoboxes(progress))
    items_by_name = {i['title']: i for i in items}
    for item in items:
        item['archived'] = item['title'] in archived_titles

    print('\nFilling in intermediate products...')
    inter_tables = get_inter_tables(inter_needed(items), progress)
    used = 0
    for table_page in inter_tables:
        # Best effort: one unparseable table must not abort the whole pull.
        try:
            title, recipes = parse_inter_table(table_page)
            if recipes:
                used += 1
                items_by_name[title].update(recipes)
        except Exception as e:
            print(f'\nWarning: {table_page["title"]} failed to parse - {e}')
    print(f'\n{used} intermediate tables used.')

    fn = 'items.json.xz'
    print(f'Saving to {fn}... ', end='')
    save(fn, items_by_name)
    print(f'{getsize(fn)//1024} kiB')





if __name__ == '__main__':

    main()

You need to run it before any of the next steps. After the data are pulled, run the preprocessing script:

#!/usr/bin/env python3



import json, lzma, re

import numpy as np

from collections import defaultdict

from os.path import getsize

from scipy.sparse import lil_matrix, save_npz

from sys import stdout

from typing import Dict, Iterable, Set, Sequence





# Captures the leading number and the SI prefix of a power or energy figure,
# e.g. '300 {{Translation|kW}} burner' -> ('300', 'k').
power_re = re.compile(r'([0-9.]+) .*([kMG])[WJ]')

# Multiplier for each SI prefix: none, kilo, mega, giga.
si_facs = {
    '': 1,
    'k': 1_000,
    'M': 1_000_000,
    'G': 1_000_000_000,
}





class Item:
    """
    One wiki item, built from the dict saved by the pull script.

    Every key of `data` becomes an attribute with '-' replaced by '_'. The
    attributes named in _FIELDS always exist and default to None.
    """

    _FIELDS = (
        'archived',
        'cost',
        'cost_multiplier',
        'crafting_speed',
        'dimensions',
        'energy',
        'fluid_consumption',
        'fuel_value',
        'mining_hardness',
        'mining_power',
        'mining_speed',
        'mining_time',
        'pollution',
        'power_output',
        'producers',
        'prototype_type',
        'recipe',
        'recipes',
        'title',
        'valid_fuel',
    )

    def __init__(self, data: dict):
        self.data = data
        for name in self._FIELDS:
            setattr(self, name, None)
        self.__dict__.update({k.replace('-', '_'): v
                              for k, v in data.items()})
        self.fill_gaps()

    def fill_gaps(self):
        """Hard-code values for a few items, chosen by type or title."""
        if self.prototype_type == 'technology':
            self.producers = 'Lab'
        elif self.title in ('Flamethrower turret', 'Gun turret',
                            'Laser turret'):
            self.producers = 'Assembling machine + manual'
        elif self.title == 'Space science pack':
            self.recipe = ('Time, 41.25 + Rocket part, 100 = '
                           'Space science pack, 1000')
        elif self.title == 'Steam':
            ex_rate = 10e6 * 60 / 5.82e6
            self.recipes = (
                {
                    'process': 'Steam165 (Boiler)',
                    'building': 'Boiler',
                    'inputs': {
                        'Water': 60,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam165': 60
                    }
                },
                {
                    'process': 'Steam500 (Heat exchanger)',
                    'building': 'Heat exchanger',
                    'inputs': {
                        'Water': ex_rate,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam500': ex_rate
                    }
                }
            )

    def __str__(self) -> str:
        return self.title

    @property
    def keep(self) -> bool:
        """
        Whether the item takes part in the analysis: not archived, not a
        Rock or Tree, and either has cost/recipe/recipes data, has a
        mining hardness, or is one of the special-cased titles.
        """
        return (
            (not self.archived) and
            (self.title not in {'Rock', 'Tree'}) and
            (
                any(self.data.get(k) for k in ('cost', 'recipe', 'recipes'))
                or 'mining-hardness' in self.data
                or self.title in {'Crude oil',
                                  'Water',
                                  'Space science pack',
                                  'Steam'}
            )
        )

    def get_recipes(self) -> Iterable:
        """Yield every recipe that produces this item."""
        if self.recipes:
            for rates in self.recipes:
                fac = RecipeFactory(self, rates=rates)
                yield from fac.make()
        else:
            fac = RecipeFactory(self)
            yield from fac.make()

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """Mining rate of this item used as a miner on the given resource."""
        return (
                (float(self.mining_power) - mining_hardness)
                * float(self.mining_speed) / mining_time
        )





# Module-wide registry of items keyed by lower-cased name; None until load()
# assigns it.
all_items: Dict[str, Item] = None





class ManualMiner:
    """
    Stand-in producer for mining by hand with a mining tool.

    It exposes title, pollution, dimensions and mine_rate(); it produces no
    pollution and has a 0×0 footprint.
    """

    def __init__(self, tool: Item):
        self.tool = tool
        self.title = f'Manual with {tool}'
        self.pollution, self.dimensions = 0, '0×0'

    def __str__(self) -> str:
        return self.title

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """Manual mining rate: 0.6 * (tool power - hardness) / time."""
        effective_power = float(self.tool.mining_power) - mining_hardness
        return 0.6 * effective_power / mining_time





class Recipe:
    """
    One way of producing `resource` with a specific producer.

    `rates` maps resource names to quantities; it is copied and then adjusted
    for the producer by multiply_producer().
    """

    def __init__(self, resource: str, producer: Item, rates: dict,
                 title: str = None):
        self.resource = resource
        self.title = title if title else f'{resource} ({producer})'

        self.rates = dict(rates)
        self.producer = producer
        self.multiply_producer(producer)

    def __str__(self) -> str:
        return self.title

    def multiply_producer(self, prod: Item):
        """Scale the rates by the producer's crafting speed where it has one."""
        unscaled = {'Boiler', 'Heat exchanger', 'Solar panel',
                    'Steam engine', 'Steam turbine'}
        if prod.title in unscaled:
            return  # no crafting rate modifier
        if prod.title == 'Nuclear reactor':
            self.rates['Heat'] = parse_power(prod.energy)
            return

        speed = float(prod.crafting_speed)
        for name, quantity in self.rates.items():
            self.rates[name] = quantity * speed





class MiningRecipe(Recipe):
    """Recipe whose output rate is the producer's mining rate."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 mining_hardness: float, mining_time: float, title: str = ''):
        # Set before the base constructor runs, as it calls multiply_producer().
        self.mining_hardness = mining_hardness
        self.mining_time = mining_time
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, miner: Item):
        mined = self.producer.mine_rate(self.mining_hardness,
                                        self.mining_time)
        self.rates[self.resource] = mined
        if self.resource == 'Uranium ore':
            # Uranium ore consumes sulphuric acid at the rate it is mined.
            self.rates['Sulphuric acid'] = -mined





class TechRecipe(Recipe):
    """Recipe for a technology; its output rate is divided by the cost multiplier."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 cost_multiplier: float, title: str = ''):
        # Set before the base constructor runs, as it calls multiply_producer().
        self.cost_multiplier = cost_multiplier
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, lab: Item):
        unscaled = self.rates[self.resource]
        self.rates[self.resource] = unscaled / self.cost_multiplier





class FluidRecipe(Recipe):
    """Recipe for fluids extracted by pumpjacks and offshore pumps."""

    def multiply_producer(self, producer: Item):
        name = producer.title
        if name == 'Offshore pump':
            output = 1200
        elif name == 'Pumpjack':
            yield_factor = 1.00  # Assumed
            output = 10*yield_factor
        else:
            raise NotImplementedError()
        self.rates[self.resource] = output





class RecipeFactory:
    """
    Builds the Recipe objects for one resource.

    Given explicit `rates` (a dict with 'process', 'inputs', 'outputs' and
    optionally 'building'), the recipe comes from that dict. Otherwise it is
    derived from the item's recipe/cost string, or the item is treated as a
    mined or pumped raw resource.
    """

    def __init__(self, resource: Item, rates: dict = None):
        self.resource = resource
        self.producers = ()
        if rates:
            self.producers, self.title, self.rates = self.intermediate(rates)
        else:
            self.title = None
            needs_producers = False
            recipe = resource.recipe or resource.cost
            if recipe:
                self.rates = self.parse_recipe(recipe)
                if resource.prototype_type == 'technology':
                    self.producers = (all_items['lab'],)
                else:
                    needs_producers = True
            else:
                if (resource.mining_time or
                        resource.title in {'Crude oil', 'Water'}):
                    self.rates = {}
                    # Raw wood takes its miners from wood_mining() instead.
                    if resource.title != 'Raw wood':
                        needs_producers = True
                else:
                    raise NotImplementedError()
            if needs_producers:
                self.producers = tuple(parse_producers(resource.producers))

    def __str__(self) -> str:
        return self.title

    def intermediate(self, rates) -> (Iterable[Item], str, dict):
        """Return (producers, title, rates) for an explicit recipe dict."""
        building = rates.get('building')
        if building:
            producers = (all_items[building.lower()],)
        else:
            # Materialised so the producers can be indexed and re-iterated.
            producers = tuple(parse_producers(self.resource.producers))
        title = rates['process']
        sane_rates = self.calc_recipe(rates['inputs'], rates['outputs'])
        return producers, title, sane_rates

    @staticmethod
    def parse_side(s: str) -> Dict[str, float]:
        """Parse 'Name, qty + Name, qty' into a {name: qty} dict."""
        out = {}
        for pair in s.split('+'):
            k, v = pair.split(',')
            out[k.strip()] = float(v.strip())
        return out

    @staticmethod
    def calc_recipe(inputs: Dict[str, float],
                    outputs: Dict[str, float]) -> Dict[str, float]:
        """
        Convert per-craft quantities into per-time rates.

        Outputs are positive and inputs negative, each divided by the 'time'
        (or 'Time') entry of `inputs`. Neither argument is modified.

        :raises KeyError: if `inputs` has neither 'time' nor 'Time'.
        """
        time_key = 'time' if 'time' in inputs else 'Time'
        t = inputs[time_key]
        rates = defaultdict(float, {k: v / t for k, v in outputs.items()})
        for k, v in inputs.items():
            if k != time_key:
                rates[k] -= v / t
        return rates

    def parse_recipe(self, recipe: str) -> Dict[str, float]:
        """Parse 'inputs' or 'inputs = outputs'; default output is 1 of the item."""
        if '=' in recipe:
            inputs, outputs = recipe.split('=')
            outputs = self.parse_side(outputs)
        else:
            inputs = recipe
            outputs = {self.resource.title: 1}

        return self.calc_recipe(self.parse_side(inputs), outputs)

    def produce(self, cls, producer, **kwargs):
        """Instantiate `cls` for one producer, adding Pollution and Area rates."""
        kwargs.setdefault('title', self.title)
        recipe = cls(self.resource.title, producer, self.rates, **kwargs)
        if producer.pollution:
            recipe.rates['Pollution'] = float(producer.pollution)

        dims = tuple(float(x) for x in producer.dimensions.split('×'))
        recipe.rates['Area'] = dims[0] * dims[1]

        return recipe

    def for_energy(self, cls, **kwargs) -> Iterable[Recipe]:
        """
        Yield one recipe per producer, or per (producer, fuel) pair for
        burners, with the energy consumption added to the rates.
        """
        for producer in self.producers:
            energy = -parse_power(producer.energy)

            if 'electric' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Energy'] = energy
                yield recipe

            elif 'heat' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Heat'] = energy
                yield recipe

            elif 'burner' in producer.energy:
                for fuel_name in producer.valid_fuel.split('+'):
                    fuel_name = fuel_name.strip().lower()
                    fuel = all_items[fuel_name]
                    fuel_value = parse_power(fuel.fuel_value)
                    new_kwargs = dict(kwargs)
                    if self.title:
                        title = self.title
                    else:
                        title = f'{self.resource} ({producer})'
                    new_kwargs['title'] = f'{title} fueled by {fuel_name}'

                    recipe = self.produce(cls, producer, **new_kwargs)
                    recipe.rates[fuel.title] = energy / fuel_value
                    yield recipe
            else:
                raise NotImplementedError()

    # Finds '<integer> ...|<source>}' entries in the mining-time field.
    tree_re = re.compile(r'(\d+) .*?\|([^}|]+)}')

    def wood_mining(self) -> Iterable[MiningRecipe]:
        """Yield a manual mining recipe per (wood source, mining tool) pair."""
        miners = tuple(
            ManualMiner(tool)
            for tool in all_items.values()
            if tool.prototype_type == 'mining-tool'
        )
        for m in self.tree_re.finditer(self.resource.mining_time):
            mining_time, source = int(m[1]), m[2]
            for miner in miners:
                yield self.produce(
                    MiningRecipe, miner,
                    mining_hardness=float(self.resource.mining_hardness),
                    mining_time=mining_time,
                    title=f'{self.resource} ({miner} from {source})')

    def make(self) -> Iterable[Recipe]:
        """Yield every recipe for the resource, dispatching on its kind."""
        if self.rates:
            if self.resource.prototype_type == 'technology':
                yield self.produce(
                    TechRecipe, self.producers[0],
                    cost_multiplier=float(self.resource.cost_multiplier))
            elif self.resource.title == 'Energy':
                yield self.produce(Recipe, self.producers[0])
            else:
                yield from self.for_energy(Recipe)
        elif self.resource.title == 'Raw wood':
            yield from self.wood_mining()
        elif self.resource.mining_time:
            yield from self.for_energy(
                MiningRecipe,
                mining_hardness=float(self.resource.mining_hardness),
                mining_time=float(self.resource.mining_time))
        elif self.resource.title == 'Crude oil':
            yield from self.for_energy(FluidRecipe)
        elif self.resource.title == 'Water':
            yield self.produce(FluidRecipe, self.producers[0])
        else:
            raise NotImplementedError()





def parse_power(s: str) -> float:
    """
    Convert a figure such as '300 {{Translation|kW}} burner' to base units
    by applying the multiplier of its SI prefix.
    """
    found = power_re.search(s)
    number, prefix = found[1], found[2]
    return float(number) * si_facs[prefix]





def items_of_type(t: str) -> Iterable[Item]:
    """Lazily iterate over the registered items whose prototype type is `t`."""
    return filter(lambda item: item.prototype_type == t, all_items.values())





# Producer names of the form 'empty <fluid> barrel'; these are skipped.
barrel_re = re.compile(r'empty .+ barrel')


def parse_producers(s: str) -> Iterable[Item]:
    """
    Yield the Item for each producer named in a '+'-separated string.

    Names are compared lower-cased. 'furnace', 'assembling machine' and
    'mining drill' each expand to several items; 'manual' and barrel
    producers yield nothing.
    """
    groups = {
        'assembling machine': tuple(f'assembling machine {i}'
                                    for i in range(1, 4)),
        'mining drill': tuple(f'{t} mining drill'
                              for t in ('burner', 'electric')),
    }
    for raw in s.split('+'):
        name = raw.strip().lower()
        if name == 'manual' or barrel_re.match(name):
            continue
        if name == 'furnace':
            yield from items_of_type('furnace')
        elif name in groups:
            for member in groups[name]:
                yield all_items[member]
        else:
            yield all_items[name]





def trim(items: dict):
    """Remove, in place, every entry whose value has a falsy `keep`."""
    doomed = [name for name, item in items.items() if not item.keep]
    print(f'Dropping {len(doomed)} items...')
    for name in doomed:
        del items[name]





def energy_data() -> dict:
    """
    Build the data dict for a synthetic 'Energy' item.

    It has four recipes: the solar panel at its average output, the steam
    engine, and the steam turbine on 165C and on 500C steam. Rates and
    outputs are read from the solar panel, steam engine and steam turbine
    items, except the two turbine outputs, which are fixed constants.
    """
    def steam_rate(generator) -> float:
        # fluid_consumption is expected to look like '<rate>/<unit>'
        return float(generator.fluid_consumption.split('/')[0])

    def recipe(building, process, inputs, power):
        return {
            'building': building,
            'process': process,
            'inputs': inputs,
            'outputs': {'Energy': power},
        }

    solar_lines = all_items['solar panel'].power_output.split('<br/>')
    solar_ave = parse_power(next(s for s in solar_lines if 'average' in s))

    eng = all_items['steam engine']
    eng_rate = steam_rate(eng)
    eng_power = parse_power(eng.power_output)

    turbine_rate = steam_rate(all_items['steam turbine'])
    turbine_power_500 = 5.82e6  # ignore non-precise data and use this instead
    turbine_power_165 = 1.8e6   # from wiki page body

    return {
        'title': 'Energy',
        'recipes': (
            recipe('Solar panel', 'Energy (Solar panel)',
                   {'Time': 1}, solar_ave),
            recipe('Steam engine', 'Energy (Steam engine)',
                   {'Time': 1, 'Steam165': eng_rate}, eng_power),
            recipe('Steam turbine', 'Energy (Steam turbine @ 165C)',
                   {'Time': 1, 'Steam165': turbine_rate}, turbine_power_165),
            recipe('Steam turbine', 'Energy (Steam turbine @ 500C)',
                   {'Time': 1, 'Steam500': turbine_rate}, turbine_power_500),
        )
    }





def load(fn: str):
    """
    Populate the global `all_items` from the xz-compressed JSON file `fn`.

    Keys are lower-cased and each value is wrapped in an Item; a synthetic
    'energy' item is added afterwards.
    """
    global all_items
    with lzma.open(fn) as f:
        raw = json.load(f)
    all_items = {name.lower(): Item(data) for name, data in raw.items()}
    # energy_data() reads all_items, so it must run after the assignment above.
    all_items['energy'] = Item(energy_data())





def get_recipes() -> (Dict[str, Recipe], Set[str]):
    """
    Collect the recipes of every registered item.

    Returns (recipes by title, set of every resource name appearing in any
    recipe's rates).
    """
    by_title = {}
    resource_names = set()
    for item in all_items.values():
        # Materialise first so an item's recipes are added all-or-nothing.
        produced = tuple(item.get_recipes())
        for recipe in produced:
            by_title[recipe.title] = recipe
        for recipe in produced:
            resource_names.update(recipe.rates)

    return by_title, resource_names





def field_size(names: Iterable) -> int:
    """Return the length of the longest str() representation in `names`."""
    return max(map(len, map(str, names)))





def write_csv_for_r(recipes: Sequence[Recipe], resources: Sequence[str],
                    fn: str):
    """
    Write the recipe/resource matrix to `fn` as xz-compressed, padded CSV.

    Recipes go down and resources go right. The first line is the header of
    resource names; each following line starts with the recipe title and
    holds one signed scientific-notation value per resource (0 when the
    recipe does not involve it).
    """
    rec_width = field_size(recipes)
    float_width = 15
    # '+d.' + float_width digits + 'e+dd' + ',' is float_width + 8 characters.
    header_format = f'{{:{float_width+8}}}'
    rec_format = '\n{:' + str(rec_width+1) + '}'

    with lzma.open(fn, 'wt') as f:
        f.write(' '*(rec_width+1))
        for res in resources:
            f.write(header_format.format(f'{res},'))

        for rec in recipes:
            f.write(rec_format.format(f'{rec},'))
            for res in resources:
                x = rec.rates.get(res, 0)
                cell_format = f'{{:+{len(res)}.{float_width}e}},'
                f.write(cell_format.format(x))





def write_for_numpy(recipes: Sequence[Recipe], resources: Sequence[str],
                    meta_fn: str, npz_fn: str):
    """
    Save the recipe matrix and its labels for use from numpy/scipy.

    `meta_fn` receives the 'recipe_names' and 'resource_names' string arrays;
    `npz_fn` receives a sparse CSR matrix with one row per resource and one
    column per recipe, holding each recipe's rates.

    :raises KeyError: if a recipe uses a resource missing from `resources`.
    """
    rec_names = [r.title for r in recipes]
    w_rec = max(len(r) for r in rec_names)
    # asarray, not array(copy=False): NumPy 2 raises when copy=False cannot
    # be honoured, and building an array from a list always needs a copy.
    recipe_names = np.asarray(rec_names, dtype=f'U{w_rec}')

    w_res = max(len(r) for r in resources)
    resource_names = np.asarray(resources, dtype=f'U{w_res}')

    np.savez_compressed(meta_fn, recipe_names=recipe_names,
                        resource_names=resource_names)

    # Resolve each resource's row once instead of scanning per matrix entry.
    # setdefault keeps the first row if a name repeats, like list.index().
    row_of = {}
    for i, res in enumerate(resources):
        row_of.setdefault(res, i)

    rec_mat = lil_matrix((len(resources), len(recipes)))
    for j, rec in enumerate(recipes):
        for res, q in rec.rates.items():
            rec_mat[row_of[res], j] = q
    save_npz(npz_fn, rec_mat.tocsr())





def file_banner(fn):
    """Print the file name followed by its size in whole kiB."""
    size_kib = getsize(fn) // 1024
    print(f'{fn} {size_kib} kiB')





def main():
    """
    Preprocess the pulled item data into recipe matrices.

    Loads items.json.xz, drops the items that are not kept, computes all
    recipes, and writes them out for numpy (recipe-names.npz, recipes.npz)
    and for R (recipes.csv.xz).
    """
    items_fn = 'items.json.xz'
    print(f'Loading {items_fn}... ', end='')
    load(items_fn)
    print(f'{len(all_items)} items')

    trim(all_items)

    print('Calculating recipes... ', end='')
    recipe_map, resource_set = get_recipes()
    print(f'{len(recipe_map)} recipes, {len(resource_set)} resources')

    # Sort both axes so the output files are laid out deterministically.
    resource_list = sorted(resource_set)
    recipe_list = sorted(recipe_map.values(), key=lambda rec: rec.title)

    print('Saving files for numpy...')
    meta_fn = 'recipe-names.npz'
    npz_fn = 'recipes.npz'
    write_for_numpy(recipe_list, resource_list, meta_fn, npz_fn)
    for written in (meta_fn, npz_fn):
        file_banner(written)

    csv_fn = 'recipes.csv.xz'
    print('Saving recipes for use by R...')
    stdout.flush()
    write_csv_for_r(recipe_list, resource_list, csv_fn)
    file_banner(csv_fn)





if __name__ == '__main__':

    main()

That's followed by an analysis script that I won't post here, to constrain the scope of this first review.

asked 18 mins ago

Reinderien

5,280926

add a comment |

This project is... a little ridiculous. It's working, but it's a complete mess.

Data about Factorio's game economy are pulled from the wiki via the MediaWiki API, scrubbed, preprocessed, and thrown into Scipy for linear programming analysis using the MOSEK interior point method.

The pull script only depends on requests:

#!/usr/bin/env python3



import json, lzma, re

from os.path import getsize

from requests import Session

from sys import stdout



session = Session()





def get_mediawiki(content=False, progress=None, **kwargs):
    """
    Query the wiki's MediaWiki API and lazily yield page dicts.

    https://stable.wiki.factorio.com is an instance of MediaWiki.
    The API endpoint is
    https://stable.wiki.factorio.com/api.php

    :param content: if true, also request revision text and yield only the
        pages whose 'revisions' arrived in the current response.
    :param progress: optional callable(so_far, total), invoked once per
        non-empty response when content is requested.
    :param kwargs: extra query parameters passed through to the API.
    :return: generator of page dicts as returned under query.pages.
    """
    params = {'action': 'query',
              'format': 'json',
              **kwargs}
    if content:
        params.update({'prop': 'revisions',
                       'rvprop': 'content'})
    so_far = 0
    while True:
        resp = session.get('https://stable.wiki.factorio.com/api.php',
                           params=params)
        resp.raise_for_status()

        doc = resp.json()
        # A query that matches nothing comes back without a 'query' key.
        pages = doc.get('query', {}).get('pages', {}).values()
        if content:
            full_pages = tuple(p for p in pages if 'revisions' in p)
            if progress and pages:
                so_far += len(full_pages)
                progress(so_far, len(pages))
            yield from full_pages
        else:
            yield from pages

        # 'batchcomplete' only means the current batch of pages is fully
        # populated; more batches follow for as long as the response carries
        # a 'continue' block, so that is the only reliable stop condition.
        if 'continue' not in doc:
            break
        params.update(doc['continue'])





def get_category(name, content=False, progress=None, **kwargs):
    """
    Query the members of the wiki category called `name`.

    Only members of type 'page' are requested, up to 500 per request.
    `content`, `progress` and any extra `kwargs` are forwarded unchanged to
    get_mediawiki(), whose result is returned.
    """
    member_query = {
        'generator': 'categorymembers',
        'gcmtitle': f'Category:{name}',
        'gcmtype': 'page',
        'gcmlimit': 500,
    }
    return get_mediawiki(content=content, progress=progress,
                         **member_query, **kwargs)





def get_archived_titles():
    """Return the members of the wiki's 'Archived' category (no content)."""
    category = 'Archived'
    return get_category(category)





def get_infoboxes(progress):
    """
    Return the members of the 'Infobox_page' category with their content,
    reporting download progress through the `progress` callback.
    """
    options = {'content': True, 'progress': progress}
    return get_category('Infobox_page', **options)





def get_inter_tables(titles, progress):
    """
    Request the content of the pages named in `titles`.

    The titles are joined with '|' into a single `titles` query parameter.
    """
    joined_titles = '|'.join(titles)
    return get_mediawiki(content=True, progress=progress,
                         titles=joined_titles)





# Separator between infobox parameters: a newline, optional whitespace,
# then a literal pipe.
line_re = re.compile(r'\n\s*\|')

# One 'key = value' infobox parameter; surrounding whitespace is dropped.
var_re = re.compile(
    r'^\s*'
    r'(\S+)'     # key
    r'\s*=\s*'
    r'(.+?)'     # value
    r'\s*$')





def parse_infobox(page):
    """
    Turn an infobox page into a flat dict of its template parameters.

    The returned dict holds 'pageid', 'title' (the page title with everything
    up to the first ':' removed) and one string entry per 'key = value'
    parameter found between the first '{{' and the last '}}'.

    Example page content:

    {{Infobox
    |prototype-type = mining-drill
    |internal-name = burner-mining-drill
    |category = Production
    |health = 150
    |stack-size=50
    |dimensions=2×2
    |energy=300 {{Translation|kW}} burner
    |mining-power=2.5
    |mining-speed=0.35
    |pollution=10
    |valid-fuel = Wood + Raw wood + Wooden chest + Coal + Solid fuel
    |recipe = Time, 2 + Iron gear wheel, 3 + Iron plate, 3 + Stone furnace, 1
    |producers=Manual + Assembling machine 2 + Assembling machine 3
    }}<noinclude>
    [[Category:Infobox page]]
    </noinclude>

    Parameters are split on newline-then-pipe. That is not strictly right,
    because https://www.mediawiki.org/wiki/Help:Templates#Named_parameters
    only requires the pipe; but splitting on the pipe alone is worse, since
    pipes also occur inside links and nested templates.
    """
    wikitext = page['revisions'][0]['*']
    after_open = wikitext.split('{{', maxsplit=1)[1]
    template_body = after_open.rsplit('}}', maxsplit=1)[0]
    chunks = line_re.split(template_body)

    short_title = page['title'].split(':', maxsplit=1)[1]
    info = {'pageid': page['pageid'],
            'title': short_title}
    for chunk in chunks:
        found = var_re.match(chunk)
        if found:
            key, value = found.groups()
            info[key] = value
    return info





# One template argument: optional whitespace, then text free of pipes/braces.
part_tok = r'\s*([^|{}]*?)'
# The pipe separating template arguments.
border_tok = r'\s*\|'
# Matches one {{Type|name|amount|...}} template plus the separator after it.
# Groups: type, first argument, optional second argument, separator.
row_image_re = re.compile(
    r'\{\{\s*'
    r'(?P<type>\w+)'
    f'{border_tok}'
    f'{part_tok}'
    r'(?:'
       f'{border_tok}'
       f'{part_tok}'
    r')?'
    r'(?:'                # any further arguments are skipped
       f'{border_tok}'
       r'[^{}]*'
    r')?'
    r'\}\}\s*'
    r'(?P<sep>'           # '||' (next cell), '+' or '→'; may be empty
      r'(?:'
        r'\|\||\+|→'
      r')?'
    r')',
)





def iter_cells(row):
    """
    Split one wikitable row into cells, each a list of template tuples.

    A template followed by '||' closes the current cell and is stored without
    its separator (3-tuple); every other template keeps its separator as the
    last element (4-tuple).

    e.g.
    | {{Icon|Solid fuel from light oil||}}
    || {{icon|Light oil|10}} + {{icon|time|3}}
    || {{icon|Solid fuel|1}}
    or
    | {{Imagelink|Oil refinery}}
    || {{Imagelink|Basic oil processing}}
    || {{Icon|Crude oil|100}} + {{icon|Time|5}}
    → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})
    """

    cell = []
    for m in row_image_re.finditer(row):
        if m.group('sep') == '||':
            cell.append(m.groups()[:-1])
            yield cell
            cell = []
        else:
            cell.append(m.groups())
    # The last cell of a row has no trailing '||'.
    if cell:
        yield cell





def parse_inter_table(page):
    """
    Extract recipes from the first wikitable of an intermediate-product page.

    Two table layouts are understood:

    {| class="wikitable"
    ! Building !! Process !! Results
    |-
    | {{Imagelink|Oil refinery}} || {{Imagelink|Basic oil processing}} || {{Icon|Crude oil|100}} + {{icon|Time|5}} → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})
    |}

    or

    {| class="wikitable"
    ! Process !! Input !! Output
    |-
    | {{Icon|Solid fuel from heavy oil||}} || {{icon|Heavy oil|20}} + {{icon|time|3}} || {{icon|Solid fuel|1}}
    |-
    |}

    Returns (title, {'recipes': [row, ...]}), where each row is a dict with
    'inputs' and 'outputs' (name -> int) plus optional 'process' and
    'building'. Returns (title, {}) when the page has no table or the table
    has a blank heading.

    Raises ValueError for an unknown heading or for a template other than
    icon inside an input/output/results cell.
    """
    title = page['title']
    content = page['revisions'][0]['*']
    if '{|' not in content:
        return title, {}

    rows = []
    # Newlines are removed so that each table row becomes one string
    body = (content
            .replace('\n', '')
            .split('{|', maxsplit=1)[1]
            .rsplit('|}', maxsplit=1)[0])
    row_strings = body.split('|-')
    heads = tuple(h.strip().lower() for h in row_strings[0]
                  .split('!', maxsplit=1)[1]
                  .split('!!'))

    for line in row_strings[1:]:
        inputs = {}
        outputs = {}
        row = {'inputs': inputs, 'outputs': outputs}
        for head, parts in zip(heads, iter_cells(line)):
            if head in ('process', 'building'):
                row[head] = parts[0][1]
                continue
            if head not in ('input', 'output', 'results'):
                if head == '':
                    return title, {}  # Space science pack edge case
                raise ValueError(f'Unrecognized head {head}')

            # A 'results' cell reads "inputs → outputs", so it starts on the
            # input side and flips once the arrow has been seen.
            side = outputs if head == 'output' else inputs
            for part in parts:
                res_type = part[0].lower()
                if res_type != 'icon':
                    raise ValueError(f'Unexpected resource type {res_type}')
                side[part[1]] = int(part[2])
                if head == 'results' and len(part) == 4 and part[-1] == '→':
                    side = outputs

        # Skip empty rows such as the one after a trailing `|-`
        if inputs or outputs:
            rows.append(row)

    return title, {'recipes': rows}





def inter_needed(items):
    """
    Lazily produce the titles of items whose recipe must come from a table:
    not archived, in the 'Intermediate products' category, and carrying
    neither a 'cost' nor a 'recipe' entry.
    """
    return (
        entry['title']
        for entry in items
        if not entry['archived']
        and entry.get('category') == 'Intermediate products'
        and 'cost' not in entry
        and 'recipe' not in entry
    )





def save(fn, recipes):
    """Write `recipes` as 4-space-indented JSON to the xz-compressed text file `fn`."""
    with lzma.open(fn, mode='wt') as compressed:
        json.dump(obj=recipes, fp=compressed, indent=4)





def main():
    """
    Pull item data from the wiki and save it to items.json.xz.

    Steps: fetch the archived titles, parse every infobox page, flag
    archived items, fill in recipes from intermediate-product tables, then
    write everything keyed by title. A table that fails to parse only
    produces a warning.
    """
    def progress(so_far, total):
        # '\r' moves the cursor back so the next update overwrites this one
        print(f'{so_far}/{total} {so_far/total:.0%}', end='\r')
        stdout.flush()

    print('Getting archived items... ', end='')
    archived_titles = {p['title'] for p in get_archived_titles()}
    print(len(archived_titles))

    print('Getting item content...')
    items = tuple(parse_infobox(p) for p in get_infoboxes(progress))
    items_by_name = {i['title']: i for i in items}
    for item in items:
        item['archived'] = item['title'] in archived_titles

    print('\nFilling in intermediate products...')
    inter_tables = get_inter_tables(inter_needed(items), progress)
    used = 0
    for table_page in inter_tables:
        try:
            title, recipes = parse_inter_table(table_page)
            if recipes:
                used += 1
                items_by_name[title].update(recipes)
        except Exception as e:
            # Best effort: one malformed page must not abort the whole pull
            print(f'\nWarning: {table_page["title"]} failed to parse - {e}')
    print(f'\n{used} intermediate tables used.')

    fn = 'items.json.xz'
    print(f'Saving to {fn}... ', end='')
    save(fn, items_by_name)
    print(f'{getsize(fn)//1024} kiB')





# Run the pull only when this file is executed as a script, not on import.
if __name__ == '__main__':

    main()

You need to run it before any of the next steps. After the data are pulled, run the preprocessing script:

#!/usr/bin/env python3



import json, lzma, re

import numpy as np

from collections import defaultdict

from os.path import getsize

from scipy.sparse import lil_matrix, save_npz

from sys import stdout

from typing import Dict, Iterable, Set, Sequence





# Captures the leading number and the SI prefix letter of a power or energy
# string, e.g. "300 {{Translation|kW}} burner" -> ('300', 'k').
power_re = re.compile(r'([0-9.]+) .*([kMG])[WJ]')

# SI prefix -> multiplier: '' is 1, then each step is a factor of 1000.
si_facs = {
    prefix: 1000**step
    for step, prefix in enumerate(['', 'k', 'M', 'G'])
}





class Item:
    """
    One entry of the pulled wiki data (an item, building or technology).

    Every key of `data` becomes an attribute, with '-' replaced by '_'.
    The attributes listed in `__init__` default to None when the key is
    absent, and `fill_gaps()` patches a few entries by hand.
    """

    def __init__(self, data: dict):
        self.data = data
        # Pre-declare every attribute read elsewhere so that a missing wiki
        # field yields None instead of AttributeError.
        (
            self.archived,
            self.cost,
            self.cost_multiplier,
            self.crafting_speed,
            self.dimensions,
            self.energy,
            self.fluid_consumption,
            self.fuel_value,
            self.mining_hardness,
            self.mining_power,
            self.mining_speed,
            self.mining_time,
            self.pollution,
            self.power_output,
            self.producers,
            self.prototype_type,
            self.recipe,
            self.recipes,
            self.title,
            self.valid_fuel
        ) = (None,)*20
        self.__dict__.update({k.replace('-', '_'): v
                              for k, v in data.items()})
        self.fill_gaps()

    def fill_gaps(self):
        """Hard-code producers or recipes for the entries handled below."""
        if self.prototype_type == 'technology':
            self.producers = 'Lab'
        elif self.title in ('Flamethrower turret', 'Gun turret',
                            'Laser turret'):
            self.producers = 'Assembling machine + manual'
        elif self.title == 'Space science pack':
            # Parenthesised so the two literals form a single string
            self.recipe = ('Time, 41.25 + Rocket part, 100 = '
                           'Space science pack, 1000')
        elif self.title == 'Steam':
            ex_rate = 10e6 * 60 / 5.82e6
            self.recipes = (
                {
                    'process': 'Steam165 (Boiler)',
                    'building': 'Boiler',
                    'inputs': {
                        'Water': 60,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam165': 60
                    }
                },
                {
                    'process': 'Steam500 (Heat exchanger)',
                    'building': 'Heat exchanger',
                    'inputs': {
                        'Water': ex_rate,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam500': ex_rate
                    }
                }
            )

    def __str__(self) -> str:
        return self.title

    @property
    def keep(self) -> bool:
        """
        True when the item should stay in the model: not archived, not
        Rock/Tree, and either it has a cost/recipe/recipes entry, a
        mining-hardness field, or it is one of the special-cased titles.
        """
        return (
            (not self.archived) and
            (self.title not in {'Rock', 'Tree'}) and
            (
                any(self.data.get(k) for k in ('cost', 'recipe', 'recipes'))
                or 'mining-hardness' in self.data
                or self.title in {'Crude oil',
                                  'Water',
                                  'Space science pack',
                                  'Steam'}
            )
        )

    def get_recipes(self) -> Iterable:
        """Yield every Recipe for this item, one factory per `recipes` entry."""
        if self.recipes:
            for rates in self.recipes:
                fac = RecipeFactory(self, rates=rates)
                yield from fac.make()
        else:
            fac = RecipeFactory(self)
            yield from fac.make()

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """Return (mining_power - hardness) * mining_speed / mining_time."""
        return (
                (float(self.mining_power) - mining_hardness)
                * float(self.mining_speed) / mining_time
        )





# Module-wide registry of loaded items, keyed by lower-cased name.
# Stays None until load() assigns the real dict.
all_items: Dict[str, Item] = None





class ManualMiner:
    """
    Stand-in producer for mining by hand with a given tool.

    Exposes the attributes the recipe code reads from a producer: title,
    pollution (0), dimensions ('0×0') and mine_rate().
    """

    def __init__(self, tool: Item):
        self.title = f'Manual with {tool}'
        self.tool = tool
        self.dimensions = '0×0'
        self.pollution = 0

    def __str__(self) -> str:
        return self.title

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """Return 0.6 * (tool mining power - hardness) / mining_time."""
        effective_power = float(self.tool.mining_power) - mining_hardness
        return 0.6 * effective_power / mining_time





class Recipe:
    """
    Rates of change per resource for one resource made by one producer.

    `rates` is copied, then adjusted for the producer by
    multiply_producer(), which subclasses override.
    """

    def __init__(self, resource: str, producer: Item, rates: dict,
                 title: str = None):
        self.resource = resource
        # Fall back to "<resource> (<producer>)" when no title is given
        self.title = title or f'{resource} ({producer})'

        self.rates = dict(rates)
        self.producer = producer
        self.multiply_producer(producer)

    def __str__(self) -> str:
        return self.title

    def multiply_producer(self, prod: Item):
        """Scale every rate by the producer's crafting speed, where it applies."""
        name = prod.title
        if name == 'Nuclear reactor':
            self.rates['Heat'] = parse_power(prod.energy)
            return
        if name in {'Boiler', 'Heat exchanger', 'Solar panel',
                    'Steam engine', 'Steam turbine'}:
            return  # no crafting rate modifier

        speed = float(prod.crafting_speed)
        for key in self.rates:
            self.rates[key] *= speed





class MiningRecipe(Recipe):
    """Recipe whose output rate comes from the producer's mine_rate()."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 mining_hardness: float, mining_time: float, title: str = ''):
        # Set before Recipe.__init__, which calls multiply_producer()
        self.mining_hardness = mining_hardness
        self.mining_time = mining_time
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, miner: Item):
        mined = self.producer.mine_rate(self.mining_hardness,
                                        self.mining_time)
        self.rates[self.resource] = mined
        if self.resource == 'Uranium ore':
            # Uranium ore consumes sulphuric acid at the same rate it is mined
            self.rates['Sulphuric acid'] = -mined





class TechRecipe(Recipe):
    """Recipe for a technology: its own rate is divided by the cost multiplier."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 cost_multiplier: float, title: str = ''):
        # Set before Recipe.__init__, which calls multiply_producer()
        self.cost_multiplier = cost_multiplier
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, lab: Item):
        unscaled = self.rates[self.resource]
        self.rates[self.resource] = unscaled / self.cost_multiplier





class FluidRecipe(Recipe):
    """Recipe for fluids drawn by a pumpjack or an offshore pump."""

    def multiply_producer(self, producer: Item):
        """Set a fixed output rate; raises NotImplementedError for other producers."""
        name = producer.title
        if name == 'Offshore pump':
            rate = 1200
        elif name == 'Pumpjack':
            yield_factor = 1.00  # Assumed
            rate = 10*yield_factor
        else:
            raise NotImplementedError()
        self.rates[self.resource] = rate





class RecipeFactory:
    """
    Builds every Recipe variant for one resource.

    With `rates` (one entry of Item.recipes) the factory describes a named
    process with explicit inputs and outputs. Without it, the rates come
    from the item's `recipe` or `cost` string, or stay empty for mined and
    pumped resources. `make()` then yields the Recipe objects.
    """

    # Finds "<int> ...|<source>}" pairs in Raw wood's mining-time text. The
    # escaped pipe is a literal pipe, not regex alternation.
    tree_re = re.compile(r'(\d+) .*?\|([^}|]+)}')

    def __init__(self, resource: 'Item', rates: dict = None):
        self.resource = resource
        self.producers = ()
        if rates:
            self.producers, self.title, self.rates = self.intermediate(rates)
        else:
            self.title = None
            needs_producers = False
            recipe = resource.recipe or resource.cost
            if recipe:
                self.rates = self.parse_recipe(recipe)
                if resource.prototype_type == 'technology':
                    self.producers = (all_items['lab'],)
                else:
                    needs_producers = True
            else:
                if (resource.mining_time or
                        resource.title in {'Crude oil', 'Water'}):
                    self.rates = {}
                    # Raw wood is mined by hand; see wood_mining()
                    if resource.title != 'Raw wood':
                        needs_producers = True
                else:
                    raise NotImplementedError()
            if needs_producers:
                self.producers = tuple(parse_producers(resource.producers))

    def __str__(self) -> str:
        return self.title

    def intermediate(self, rates) -> (Iterable['Item'], str, dict):
        """Return (producers, title, rates) for one explicit process dict."""
        building = rates.get('building')
        if building:
            producers = (all_items[building.lower()],)
        else:
            producers = parse_producers(self.resource.producers)
        title = rates['process']
        sane_rates = self.calc_recipe(rates['inputs'], rates['outputs'])
        return producers, title, sane_rates

    @staticmethod
    def parse_side(s: str) -> Dict[str, float]:
        """Parse "Name, qty + Name, qty" into {name: qty}."""
        out = {}
        for pair in s.split('+'):
            k, v = pair.split(',')
            out[k.strip()] = float(v.strip())
        return out

    @staticmethod
    def calc_recipe(inputs: Dict[str, float],
                    outputs: Dict[str, float]) -> Dict[str, float]:
        """
        Turn per-craft quantities into per-time-unit rates.

        Outputs are positive, inputs negative, and both are divided by the
        'time' (or 'Time') input, which must be present. `inputs` is not
        modified.
        """
        rates = defaultdict(float, outputs)
        # Work on a copy: popping from the caller's dict would make a second
        # call with the same recipe fail with KeyError.
        inputs = dict(inputs)
        k = 'time' if 'time' in inputs else 'Time'
        t = inputs.pop(k)
        for k in rates:
            rates[k] /= t
        for k, v in inputs.items():
            rates[k] -= v / t
        return rates

    def parse_recipe(self, recipe: str) -> Dict[str, float]:
        """Parse "inputs" or "inputs = outputs"; the default output is 1 of the resource."""
        if '=' in recipe:
            inputs, outputs = recipe.split('=')
            outputs = self.parse_side(outputs)
        else:
            inputs = recipe
            outputs = {self.resource.title: 1}

        return self.calc_recipe(self.parse_side(inputs), outputs)

    def produce(self, cls, producer, **kwargs):
        """Instantiate `cls` for `producer` and add its Pollution and Area rates."""
        kwargs.setdefault('title', self.title)
        recipe = cls(self.resource.title, producer, self.rates, **kwargs)
        if producer.pollution:
            recipe.rates['Pollution'] = float(producer.pollution)

        dims = tuple(float(x) for x in producer.dimensions.split('×'))
        recipe.rates['Area'] = dims[0] * dims[1]

        return recipe

    def for_energy(self, cls, **kwargs) -> Iterable['Recipe']:
        """
        Yield one recipe per producer, charged with that producer's energy.

        Electric and heat producers get a negative 'Energy' or 'Heat' rate.
        Burner producers yield one recipe per valid fuel, consuming that
        fuel instead.
        """
        for producer in self.producers:
            energy = -parse_power(producer.energy)

            if 'electric' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Energy'] = energy
                yield recipe

            elif 'heat' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Heat'] = energy
                yield recipe

            elif 'burner' in producer.energy:
                for fuel_name in producer.valid_fuel.split('+'):
                    fuel_name = fuel_name.strip().lower()
                    fuel = all_items[fuel_name]
                    fuel_value = parse_power(fuel.fuel_value)
                    new_kwargs = dict(kwargs)
                    if self.title:
                        title = self.title
                    else:
                        title = f'{self.resource} ({producer})'
                    new_kwargs['title'] = f'{title} fueled by {fuel_name}'

                    recipe = self.produce(cls, producer, **new_kwargs)
                    recipe.rates[fuel.title] = energy / fuel_value
                    yield recipe
            else:
                raise NotImplementedError()

    def wood_mining(self) -> Iterable['MiningRecipe']:
        """Yield a manual MiningRecipe per (wood source, mining tool) pair."""
        miners = tuple(
            ManualMiner(tool)
            for tool in all_items.values()
            if tool.prototype_type == 'mining-tool'
        )
        for m in self.tree_re.finditer(self.resource.mining_time):
            mining_time, source = int(m[1]), m[2]
            for miner in miners:
                yield self.produce(
                    MiningRecipe, miner,
                    mining_hardness=float(self.resource.mining_hardness),
                    mining_time=mining_time,
                    title=f'{self.resource} ({miner} from {source})')

    def make(self) -> Iterable['Recipe']:
        """Yield the recipes for this resource, choosing the Recipe class by kind."""
        if self.rates:
            if self.resource.prototype_type == 'technology':
                yield self.produce(
                    TechRecipe, self.producers[0],
                    cost_multiplier=float(self.resource.cost_multiplier))
            elif self.resource.title == 'Energy':
                yield self.produce(Recipe, self.producers[0])
            else:
                yield from self.for_energy(Recipe)
        elif self.resource.title == 'Raw wood':
            yield from self.wood_mining()
        elif self.resource.mining_time:
            yield from self.for_energy(
                MiningRecipe,
                mining_hardness=float(self.resource.mining_hardness),
                mining_time=float(self.resource.mining_time))
        elif self.resource.title == 'Crude oil':
            yield from self.for_energy(FluidRecipe)
        elif self.resource.title == 'Water':
            yield self.produce(FluidRecipe, self.producers[0])
        else:
            raise NotImplementedError()





def parse_power(s: str) -> float:
    """
    Convert a string matched by power_re into a plain number: the captured
    figure times the si_facs multiplier of the captured prefix.
    """
    found = power_re.search(s)
    figure = float(found[1])
    return figure * si_facs[found[2]]





def items_of_type(t: str) -> Iterable[Item]:
    """Lazily produce every loaded item whose prototype_type equals `t`."""
    return (
        candidate
        for candidate in all_items.values()
        if candidate.prototype_type == t
    )





# Matches lower-cased producer names of the form "empty <something> barrel";
# parse_producers() skips these.
barrel_re = re.compile(r'empty .+ barrel')





def parse_producers(s: str) -> Iterable[Item]:
    """
    Yield the producer items named in a '+'-separated string.

    Names are trimmed and lower-cased. The generic names 'furnace',
    'assembling machine' and 'mining drill' expand to their concrete
    items; 'manual' and barrel entries are skipped.
    """
    for raw in s.split('+'):
        name = raw.strip().lower()
        if name == 'manual' or barrel_re.match(name):
            continue

        if name == 'furnace':
            yield from items_of_type('furnace')
        elif name == 'assembling machine':
            for tier in range(1, 4):
                yield all_items[f'assembling machine {tier}']
        elif name == 'mining drill':
            for kind in ('burner', 'electric'):
                yield all_items[f'{kind} mining drill']
        else:
            yield all_items[name]





def trim(items: dict):
    """Remove, in place, every entry whose value has a falsy `keep`; print the count."""
    # Collect the keys first: a dict must not shrink while being iterated
    doomed = [key for key, value in items.items() if not value.keep]
    print(f'Dropping {len(doomed)} items...')
    for key in doomed:
        del items[key]





def energy_data() -> dict:
    """
    Build the data dict for the synthetic 'Energy' item.

    It holds four processes: solar panel, steam engine, and steam turbine
    on Steam165 and on Steam500. Figures are read from the loaded items,
    except the two turbine power outputs, which are fixed constants.
    """
    def steam_rate(machine):
        # Only the text before the first '/' of fluid_consumption is used
        return float(machine.fluid_consumption.split('/')[0])

    def process(building, name, inputs, outputs):
        return {
            'building': building,
            'process': name,
            'inputs': inputs,
            'outputs': outputs,
        }

    panel_lines = all_items['solar panel'].power_output.split('<br/>')
    solar_ave = parse_power(next(
        line for line in panel_lines if 'average' in line))

    engine = all_items['steam engine']
    eng_rate = steam_rate(engine)
    eng_power = parse_power(engine.power_output)

    turbine_rate = steam_rate(all_items['steam turbine'])
    turbine_power_500 = 5.82e6  # ignore non-precise data and use this instead
    turbine_power_165 = 1.8e6   # from wiki page body

    return {
        'title': 'Energy',
        'recipes': (
            process('Solar panel', 'Energy (Solar panel)',
                    {'Time': 1},
                    {'Energy': solar_ave}),
            process('Steam engine', 'Energy (Steam engine)',
                    {'Time': 1, 'Steam165': eng_rate},
                    {'Energy': eng_power}),
            process('Steam turbine', 'Energy (Steam turbine @ 165C)',
                    {'Time': 1, 'Steam165': turbine_rate},
                    {'Energy': turbine_power_165}),
            process('Steam turbine', 'Energy (Steam turbine @ 500C)',
                    {'Time': 1, 'Steam500': turbine_rate},
                    {'Energy': turbine_power_500}),
        )
    }





def load(fn: str):
    """
    Fill the global `all_items` from the xz-compressed JSON file `fn`,
    keyed by lower-cased name, then add the synthetic 'energy' item.
    """
    global all_items
    with lzma.open(fn) as source:
        raw_items = json.load(source)
    all_items = {name.lower(): Item(fields)
                 for name, fields in raw_items.items()}
    # energy_data() reads all_items, so it must run after the assignment
    all_items['energy'] = Item(energy_data())





def get_recipes() -> (Dict[str, Recipe], Set[str]):
    """
    Collect the recipes of every loaded item.

    Returns (recipes keyed by title, set of every resource name that
    appears in any recipe's rates).
    """
    by_title = {}
    seen_resources = set()
    for item in all_items.values():
        # Materialise first so a failing item contributes nothing
        made = tuple(item.get_recipes())
        for recipe in made:
            by_title[recipe.title] = recipe
        for recipe in made:
            seen_resources.update(recipe.rates.keys())

    return by_title, seen_resources





def field_size(names: Iterable) -> int:
    """Return the length of the longest `str()` form among `names`."""
    return max(map(len, map(str, names)))





def write_csv_for_r(recipes: Sequence[Recipe], resources: Sequence[str],
                    fn: str):
    """
    Write the recipe/resource rate table as xz-compressed, column-aligned CSV.

    Recipes go down, resources go right. The first line is the header of
    resource names; each following line starts with the recipe title and
    holds one signed exponent-format number per resource (0 when the recipe
    does not touch it).
    """
    rec_width = field_size(recipes)
    float_width = 15
    head_format = f'{{:{float_width+8}}}'
    # Each data row begins with a real newline, ending the previous line
    rec_format = '\n{:' + str(rec_width+1) + '}'
    # One number format per column, padded to at least the resource name
    # width; built once instead of once per cell.
    cell_formats = [f'{{:+{len(res)}.{float_width}e}},' for res in resources]

    with lzma.open(fn, 'wt') as f:
        f.write(' '*(rec_width+1))
        for res in resources:
            f.write(head_format.format(f'{res},'))

        for rec in recipes:
            f.write(rec_format.format(f'{rec},'))
            for res, cell_format in zip(resources, cell_formats):
                f.write(cell_format.format(rec.rates.get(res, 0)))





def write_for_numpy(recipes: 'Sequence[Recipe]', resources: Sequence[str],
                    meta_fn: str, npz_fn: str):
    """
    Save the recipe matrix and its axis labels for NumPy/SciPy.

    `meta_fn` receives the arrays `recipe_names` and `resource_names`.
    `npz_fn` receives a sparse CSR matrix with one row per resource and one
    column per recipe, holding each recipe's rate for that resource.

    Raises ValueError if a recipe uses a resource missing from `resources`.
    """
    rec_names = [r.title for r in recipes]
    w_rec = max((len(r) for r in rec_names), default=1)
    # asarray, not array(copy=False): NumPy 2 raises when a copy cannot be
    # avoided, which is always the case when converting from a list.
    recipe_names = np.asarray(rec_names, dtype=f'U{w_rec}')

    w_res = max((len(r) for r in resources), default=1)
    resource_names = np.asarray(resources, dtype=f'U{w_res}')

    np.savez_compressed(meta_fn, recipe_names=recipe_names,
                        resource_names=resource_names)

    # Row lookup built once; setdefault keeps the first position of a
    # repeated name, matching list.index().
    row_of = {}
    for i, res in enumerate(resources):
        row_of.setdefault(res, i)

    rec_mat = lil_matrix((len(resources), len(recipes)))
    for j, rec in enumerate(recipes):
        for res, q in rec.rates.items():
            try:
                i = row_of[res]
            except KeyError:
                raise ValueError(f'{res!r} is not in resources') from None
            rec_mat[i, j] = q
    save_npz(npz_fn, rec_mat.tocsr())





def file_banner(fn):
    """Print the file name followed by its size in whole kiB."""
    size_kib = getsize(fn) // 1024
    print(f'{fn} {size_kib} kiB')





def main():
    """
    Preprocess the pulled data: load items.json.xz, drop unwanted items,
    build all recipes, then write the NumPy files and the CSV for R.
    """
    items_fn = 'items.json.xz'
    print(f'Loading {items_fn}... ', end='')
    load(items_fn)
    print(f'{len(all_items)} items')

    trim(all_items)

    print('Calculating recipes... ', end='')
    recipe_map, resource_set = get_recipes()
    print(f'{len(recipe_map)} recipes, {len(resource_set)} resources')

    # Sort both axes so the output files have a stable order
    resources = sorted(resource_set)
    recipes = sorted(recipe_map.values(), key=lambda rec: rec.title)

    print('Saving files for numpy...')
    meta_fn = 'recipe-names.npz'
    npz_fn = 'recipes.npz'
    write_for_numpy(recipes, resources, meta_fn, npz_fn)
    for written in (meta_fn, npz_fn):
        file_banner(written)

    csv_fn = 'recipes.csv.xz'
    print('Saving recipes for use by R...')
    stdout.flush()
    write_csv_for_r(recipes, resources, csv_fn)
    file_banner(csv_fn)





# Run the preprocessing only when this file is executed as a script.
if __name__ == '__main__':

    main()

That's followed by an analysis script that I won't post here, to constrain the scope of this first review.

asked 18 mins ago

Reinderien

5,280926

This project is... a little ridiculous. It's working, but it's a complete mess.

Data about Factorio's game economy are pulled from the wiki via the MediaWiki API, scrubbed, preprocessed, and thrown into Scipy for linear programming analysis using the MOSEK interior point method.

The pull script only depends on requests:

#!/usr/bin/env python3



import json, lzma, re

from os.path import getsize

from requests import Session

from sys import stdout



# Single requests.Session shared by every API call in this script.
session = Session()





def get_mediawiki(content=False, progress=None, **kwargs):
    """
    Query the MediaWiki API and yield page dicts, following continuation.

    https://stable.wiki.factorio.com is an instance of MediaWiki.
    The API endpoint is
    https://stable.wiki.factorio.com/api.php

    content:  when true, request the revision content and yield only the
              pages whose 'revisions' arrived in the current response.
    progress: optional callable(so_far, total), called once per response
              when `content` is true.
    kwargs:   extra query parameters, passed through as-is.

    Raises requests.HTTPError on a non-success HTTP status.
    """
    params = {'action': 'query',
              'format': 'json',
              **kwargs}
    if content:
        params.update({'prop': 'revisions',
                       'rvprop': 'content'})
    so_far = 0
    while True:
        resp = session.get('https://stable.wiki.factorio.com/api.php',
                           params=params)
        resp.raise_for_status()

        doc = resp.json()
        pages = doc['query']['pages'].values()
        if content:
            full_pages = tuple(p for p in pages if 'revisions' in p)
            if progress:
                so_far += len(full_pages)
                progress(so_far, len(pages))
            yield from full_pages
        else:
            yield from pages

        # 'batchcomplete' only means the current batch of pages is finished;
        # a generator may still have more batches. The query is done when
        # the response carries no 'continue' element.
        if 'continue' not in doc:
            break
        params.update(doc['continue'])





def get_category(name, content=False, progress=None, **kwargs):
    """
    Yield the pages belonging to `Category:<name>`, up to 500 per request,
    through the categorymembers generator. Extra keyword arguments are
    forwarded to get_mediawiki() as query parameters.
    """
    member_query = dict(
        generator='categorymembers',
        gcmtitle=f'Category:{name}',
        gcmtype='page',
        gcmlimit=500,
        **kwargs)
    return get_mediawiki(content=content, progress=progress, **member_query)





def get_archived_titles():
    """Yield the page dicts (without content) of the 'Archived' category."""
    archived_pages = get_category('Archived')
    return archived_pages





def get_infoboxes(progress):
    """Yield, with content, every page of the 'Infobox_page' category."""
    infobox_pages = get_category('Infobox_page', content=True,
                                 progress=progress)
    return infobox_pages





def get_inter_tables(titles, progress):
    """Yield, with content, the pages named in `titles` (sent pipe-joined in one query)."""
    joined_titles = '|'.join(titles)
    return get_mediawiki(content=True, progress=progress,
                         titles=joined_titles)





# Infobox parameters are split where a newline is followed (after optional
# whitespace) by a literal pipe.
line_re = re.compile(r'\n\s*\|')

# "name = value": the name contains no whitespace, and the blanks around the
# name, the '=' and the value are dropped.
var_re = re.compile(
    r'^\s*'
    r'(\S+)'
    r'\s*=\s*'
    r'(.+?)'
    r'\s*$')





def parse_infobox(page):
    """
    Flatten an infobox page into a dict of its template parameters.

    The text between the first "{{" and the last "}}" is cut into chunks
    with line_re, and every chunk matching var_re contributes one
    name/value pair. 'pageid' and 'title' (the part of the page title after
    the first ':') are set first, so same-named parameters override them.

    Example input:

    {{Infobox
    |prototype-type = mining-drill
    |category = Production
    |dimensions=2×2
    |energy=300 {{Translation|kW}} burner
    |recipe = Time, 2 + Iron gear wheel, 3 + Iron plate, 3 + Stone furnace, 1
    |producers=Manual + Assembling machine 2 + Assembling machine 3
    }}<noinclude>
    [[Category:Infobox page]]
    </noinclude>

    Splitting on newline isn't a great idea, because
    https://www.mediawiki.org/wiki/Help:Templates#Named_parameters
    shows that only the pipe is mandatory as a separator. However, only
    splitting on pipe is worse, because there are pipes on the inside of links.
    """
    wikitext = page['revisions'][0]['*']
    after_open = wikitext.split('{{', maxsplit=1)[1]
    template_body = after_open.rsplit('}}', maxsplit=1)[0]
    chunks = line_re.split(template_body)

    name = page['title'].split(':', maxsplit=1)[1]
    info = {'pageid': page['pageid'],
            'title': name}
    for chunk in chunks:
        found = var_re.match(chunk)
        if found:
            key, value = found.groups()
            info[key] = value
    return info





# Building blocks for row_image_re. The backslash escapes matter: without
# them "|" is regex alternation and "\s" / "\w" degrade to literal letters.
part_tok = r'\s*([^|{}]*?)'   # one template argument, captured lazily
border_tok = r'\s*\|'         # the literal pipe between template arguments

# Matches one template call such as {{Icon|Crude oil|100}} together with the
# separator that follows it. Groups, in order: template name ("type"), first
# argument, optional second argument, and "sep" ('||', '+', '→' or '').
row_image_re = re.compile(
    r'\{\{\s*'
    r'(?P<type>\w+)'
    f'{border_tok}'
    f'{part_tok}'
    r'(?:'
    f'{border_tok}'
    f'{part_tok}'
    r')?'
    r'(?:'               # further arguments are consumed but not captured
    f'{border_tok}'
    r'[^{}]*'
    r')?'
    r'\}\}\s*'
    r'(?P<sep>'
    r'(?:'
    r'\|\||\+|→'
    r')?'
    r')',
)





def iter_cells(row):
    """
    Split one wikitable row into cells; each cell is a list of group tuples.

    Every match of row_image_re is one template call. A match followed by
    the `||` cell separator closes the current cell and is stored without
    its separator group (3 items). Any other match is stored with all 4
    groups, so callers can still see a trailing '+', '→' or ''.

    e.g.
    | {{Icon|Solid fuel from light oil||}}
    || {{icon|Light oil|10}} + {{icon|time|3}}
    || {{icon|Solid fuel|1}}
    or
    | {{Imagelink|Oil refinery}}
    || {{Imagelink|Basic oil processing}}
    || {{Icon|Crude oil|100}} + {{icon|Time|5}}
    → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})
    """
    cell = []
    for m in row_image_re.finditer(row):
        if m.group('sep') == '||':
            cell.append(m.groups()[:-1])
            yield cell
            cell = []
        else:
            cell.append(m.groups())
    # The last cell of a row has no closing `||`
    if cell:
        yield cell





def parse_inter_table(page):
    """
    Extract recipes from the first wikitable of an intermediate-product page.

    Two table layouts are understood:

    {| class="wikitable"
    ! Building !! Process !! Results
    |-
    | {{Imagelink|Oil refinery}} || {{Imagelink|Basic oil processing}} || {{Icon|Crude oil|100}} + {{icon|Time|5}} → {{Icon|Heavy oil|30}} + ({{Icon|Light oil|30}} {{Icon|Petroleum gas|40}})
    |}

    or

    {| class="wikitable"
    ! Process !! Input !! Output
    |-
    | {{Icon|Solid fuel from heavy oil||}} || {{icon|Heavy oil|20}} + {{icon|time|3}} || {{icon|Solid fuel|1}}
    |-
    |}

    Returns (title, {'recipes': [row, ...]}), where each row is a dict with
    'inputs' and 'outputs' (name -> int) plus optional 'process' and
    'building'. Returns (title, {}) when the page has no table or the table
    has a blank heading.

    Raises ValueError for an unknown heading or for a template other than
    icon inside an input/output/results cell.
    """
    title = page['title']
    content = page['revisions'][0]['*']
    if '{|' not in content:
        return title, {}

    rows = []
    # Newlines are removed so that each table row becomes one string
    body = (content
            .replace('\n', '')
            .split('{|', maxsplit=1)[1]
            .rsplit('|}', maxsplit=1)[0])
    row_strings = body.split('|-')
    heads = tuple(h.strip().lower() for h in row_strings[0]
                  .split('!', maxsplit=1)[1]
                  .split('!!'))

    for line in row_strings[1:]:
        inputs = {}
        outputs = {}
        row = {'inputs': inputs, 'outputs': outputs}
        for head, parts in zip(heads, iter_cells(line)):
            if head in ('process', 'building'):
                row[head] = parts[0][1]
                continue
            if head not in ('input', 'output', 'results'):
                if head == '':
                    return title, {}  # Space science pack edge case
                raise ValueError(f'Unrecognized head {head}')

            # A 'results' cell reads "inputs → outputs", so it starts on the
            # input side and flips once the arrow has been seen.
            side = outputs if head == 'output' else inputs
            for part in parts:
                res_type = part[0].lower()
                if res_type != 'icon':
                    raise ValueError(f'Unexpected resource type {res_type}')
                side[part[1]] = int(part[2])
                if head == 'results' and len(part) == 4 and part[-1] == '→':
                    side = outputs

        # Skip empty rows such as the one after a trailing `|-`
        if inputs or outputs:
            rows.append(row)

    return title, {'recipes': rows}





def inter_needed(items):
    """
    Lazily produce the titles of items whose recipe must come from a table:
    not archived, in the 'Intermediate products' category, and carrying
    neither a 'cost' nor a 'recipe' entry.
    """
    return (
        entry['title']
        for entry in items
        if not entry['archived']
        and entry.get('category') == 'Intermediate products'
        and 'cost' not in entry
        and 'recipe' not in entry
    )





def save(fn, recipes):
    """Write *recipes* as 4-space-indented JSON to the xz-compressed file *fn*."""
    serialized = json.dumps(recipes, indent=4)
    with lzma.open(fn, mode='wt') as out:
        out.write(serialized)





def main():
    """
    Pull item data from the wiki and save it to ``items.json.xz``.

    Steps:
      1. collect the titles of archived pages;
      2. download and parse every infobox, flagging each item as archived
         or not;
      3. for intermediate products that lack a recipe, download the recipe
         table and merge it into the item;
      4. dump everything, keyed by title, to an xz-compressed JSON file.

    A table that fails to parse is reported as a warning and skipped, so one
    malformed page does not abort the whole pull.
    """
    def progress(so_far, total):
        # '\r' moves back to the start of the line so that each update
        # overwrites the previous one instead of scrolling.
        print(f'{so_far}/{total} {so_far/total:.0%}', end='\r')
        stdout.flush()

    print('Getting archived items... ', end='')
    archived_titles = {p['title'] for p in get_archived_titles()}
    print(len(archived_titles))

    print('Getting item content...')
    items = tuple(parse_infobox(p) for p in get_infoboxes(progress))
    items_by_name = {i['title']: i for i in items}
    for item in items:
        item['archived'] = item['title'] in archived_titles

    # The leading newline steps past the progress line left by '\r'.
    print('\nFilling in intermediate products...')
    inter_tables = get_inter_tables(inter_needed(items), progress)
    used = 0
    for table_page in inter_tables:
        try:
            title, recipes = parse_inter_table(table_page)
            if recipes:
                used += 1
                items_by_name[title].update(recipes)
        except Exception as e:
            # Deliberately best-effort: report the page and carry on.
            print(f'\nWarning: {table_page["title"]} failed to parse - {e}')
    print(f'\n{used} intermediate tables used.')

    fn = 'items.json.xz'
    print(f'Saving to {fn}... ', end='')
    save(fn, items_by_name)
    print(f'{getsize(fn)//1024} kiB')





# Run the pull only when this file is executed as a script, not on import.
if __name__ == '__main__':

    main()

You need to run it before any of the next steps. After the data are pulled, run the preprocessing script:

#!/usr/bin/env python3



import json, lzma, re

import numpy as np

from collections import defaultdict

from os.path import getsize

from scipy.sparse import lil_matrix, save_npz

from sys import stdout

from typing import Dict, Iterable, Set, Sequence





# Captures (1) a decimal magnitude and (2) the SI prefix letter (k, M or G)
# that directly precedes a W or J unit later in the same string.
power_re = re.compile(r'([0-9.]+) .*([kMG])[WJ]')

# Multiplier for each SI prefix; the empty prefix means "no scaling".
si_facs = {
    prefix: 1000**exponent
    for exponent, prefix in enumerate(('', 'k', 'M', 'G'))
}





class Item:
    """
    One entry from the pulled wiki data (an item, building or technology).

    Every key of the source dict becomes an attribute, with dashes replaced
    by underscores (``'mining-hardness'`` becomes ``mining_hardness``). The
    attributes named in ``_DEFAULT_FIELDS`` always exist and are ``None``
    when the data does not supply them. The untouched dict stays available
    as ``data``.
    """

    # Attributes that are guaranteed to exist on every instance.
    _DEFAULT_FIELDS = (
        'archived',
        'cost',
        'cost_multiplier',
        'crafting_speed',
        'dimensions',
        'energy',
        'fluid_consumption',
        'fuel_value',
        'mining_hardness',
        'mining_power',
        'mining_speed',
        'mining_time',
        'pollution',
        'power_output',
        'producers',
        'prototype_type',
        'recipe',
        'recipes',
        'title',
        'valid_fuel',
    )

    def __init__(self, data: dict):
        """Populate attributes from *data*, then patch known gaps."""
        self.data = data
        self.__dict__.update(dict.fromkeys(self._DEFAULT_FIELDS))
        self.__dict__.update({k.replace('-', '_'): v
                              for k, v in data.items()})
        self.fill_gaps()

    def fill_gaps(self):
        """Hard-code values for the few items handled as special cases."""
        if self.prototype_type == 'technology':
            self.producers = 'Lab'
        elif self.title in ('Flamethrower turret', 'Gun turret',
                            'Laser turret'):
            self.producers = 'Assembling machine + manual'
        elif self.title == 'Space science pack':
            # Parenthesised so the two literals are joined into one string.
            self.recipe = ('Time, 41.25 + Rocket part, 100 = '
                           'Space science pack, 1000')
        elif self.title == 'Steam':
            ex_rate = 10e6 * 60 / 5.82e6
            self.recipes = (
                {
                    'process': 'Steam165 (Boiler)',
                    'building': 'Boiler',
                    'inputs': {
                        'Water': 60,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam165': 60
                    }
                },
                {
                    'process': 'Steam500 (Heat exchanger)',
                    'building': 'Heat exchanger',
                    'inputs': {
                        'Water': ex_rate,
                        'Time': 1
                    },
                    'outputs': {
                        'Steam500': ex_rate
                    }
                }
            )

    def __str__(self) -> str:
        return self.title

    @property
    def keep(self) -> bool:
        """
        Whether the item takes part in the analysis.

        Archived items and the 'Rock' / 'Tree' entries are always dropped.
        Anything else is kept if it has a cost, recipe or recipes value, has
        a 'mining-hardness' key, or is one of the explicitly listed titles.
        """
        return (
            (not self.archived) and
            (self.title not in {'Rock', 'Tree'}) and
            (
                any(self.data.get(k) for k in ('cost', 'recipe', 'recipes'))
                or 'mining-hardness' in self.data
                or self.title in {'Crude oil',
                                  'Water',
                                  'Space science pack',
                                  'Steam'}
            )
        )

    def get_recipes(self) -> Iterable:
        """
        Yield every recipe that produces this item.

        When ``recipes`` is set, one factory is built per entry; otherwise a
        single factory works from the item's own fields.
        """
        if self.recipes:
            for rates in self.recipes:
                fac = RecipeFactory(self, rates=rates)
                yield from fac.make()
        else:
            fac = RecipeFactory(self)
            yield from fac.make()

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """
        Return the mining rate of this item used as a miner:
        ``(mining_power - mining_hardness) * mining_speed / mining_time``.
        """
        return (
                (float(self.mining_power) - mining_hardness)
                * float(self.mining_speed) / mining_time
        )





# Module-wide registry of items keyed by lower-cased name. It stays None
# until load() assigns the dict built from the pulled JSON file.
all_items: Dict[str, Item] = None





class ManualMiner:
    """
    Stand-in producer for mining by hand with a given tool.

    Exposes the attributes the recipe code reads from a producer (``title``,
    ``pollution``, ``dimensions``) plus ``mine_rate``.
    """

    def __init__(self, tool: Item):
        self.pollution = 0
        self.dimensions = '0×0'
        self.title = f'Manual with {tool}'
        self.tool = tool

    def __str__(self) -> str:
        return self.title

    def mine_rate(self, mining_hardness: float, mining_time: float) -> float:
        """Return ``0.6 * (tool mining power - hardness) / mining_time``."""
        tool_power = float(self.tool.mining_power)
        return 0.6 * (tool_power - mining_hardness) / mining_time





class Recipe:
    """
    Rates at which one producer makes a resource.

    ``rates`` maps a resource name to its rate. The constructor takes its
    own copy of the mapping and then lets ``multiply_producer`` adjust it
    for the producer.
    """

    def __init__(self, resource: str, producer: Item, rates: dict,
                 title: str = None):
        self.resource = resource
        # Fall back to "<resource> (<producer>)" when no title is supplied.
        self.title = title or f'{resource} ({producer})'
        self.rates = dict(rates)
        self.producer = producer
        self.multiply_producer(producer)

    def __str__(self) -> str:
        return self.title

    def multiply_producer(self, prod: Item):
        """Adjust ``self.rates`` in place for the given producer."""
        name = prod.title
        if name in {'Boiler', 'Heat exchanger', 'Solar panel',
                    'Steam engine', 'Steam turbine'}:
            return  # no crafting rate modifier
        if name == 'Nuclear reactor':
            self.rates['Heat'] = parse_power(prod.energy)
            return

        speed = float(prod.crafting_speed)
        for key, value in self.rates.items():
            self.rates[key] = value * speed





class MiningRecipe(Recipe):
    """Recipe whose output rate comes from the producer's ``mine_rate``."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 mining_hardness: float, mining_time: float, title: str = ''):
        # Stored before delegating: Recipe.__init__ calls multiply_producer,
        # which reads both values.
        self.mining_hardness = mining_hardness
        self.mining_time = mining_time
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, miner: Item):
        """Set the resource rate from the mining rate of ``self.producer``."""
        mined = self.producer.mine_rate(self.mining_hardness,
                                        self.mining_time)
        self.rates[self.resource] = mined
        if self.resource == 'Uranium ore':
            # Sulphuric acid is recorded as consumed at the mined rate.
            self.rates['Sulphuric acid'] = -mined





class TechRecipe(Recipe):
    """Recipe for a technology; its own rate is divided by a cost multiplier."""

    def __init__(self, resource: str, producer: Item, rates: dict,
                 cost_multiplier: float, title: str = ''):
        # Must be set first: Recipe.__init__ calls multiply_producer.
        self.cost_multiplier = cost_multiplier
        super().__init__(resource, producer, rates, title)

    def multiply_producer(self, lab: Item):
        """Divide the rate of ``self.resource`` by ``self.cost_multiplier``."""
        target = self.resource
        self.rates[target] = self.rates[target] / self.cost_multiplier





class FluidRecipe(Recipe):
    """Recipe for fluids drawn by pumpjacks and offshore pumps."""

    def multiply_producer(self, producer: Item):
        """
        Set the rate of ``self.resource`` to the producer's fixed rate.

        Raises NotImplementedError for any producer other than 'Pumpjack'
        and 'Offshore pump'.
        """
        yield_factor = 1.00  # Assumed
        fixed_rates = {
            'Pumpjack': 10*yield_factor,
            'Offshore pump': 1200,
        }
        if producer.title not in fixed_rates:
            raise NotImplementedError()
        self.rates[self.resource] = fixed_rates[producer.title]





class RecipeFactory:
    """
    Build the Recipe objects that produce one resource.

    When constructed with *rates* (one entry of an item's ``recipes``), the
    producers, title and rates come from that entry. Otherwise they are
    derived from the item's ``recipe`` / ``cost`` string or, failing that,
    from its mining data. ``make()`` then yields the concrete recipes.
    """

    # Pulls "<integer> ...|<name>}" pairs out of the mining-time field of
    # Raw wood: group 1 is the mining time, group 2 the source name.
    # NOTE(review): backslashes restored after a lossy paste — confirm
    # against the wiki markup.
    tree_re = re.compile(r'(\d+) .*?\|([^}|]+)}')

    def __init__(self, resource: Item, rates: dict = None):
        """
        :param resource: the item to be produced
        :param rates: optional dict with 'process', 'inputs', 'outputs' and
            optionally 'building'
        :raises NotImplementedError: if the item has no recipe, cost or
            mining time and is neither 'Crude oil' nor 'Water'
        """
        self.resource = resource
        self.producers = ()
        if rates:
            self.producers, self.title, self.rates = self.intermediate(rates)
            return

        self.title = None
        needs_producers = False
        recipe = resource.recipe or resource.cost
        if recipe:
            self.rates = self.parse_recipe(recipe)
            if resource.prototype_type == 'technology':
                self.producers = (all_items['lab'],)
            else:
                needs_producers = True
        elif (resource.mining_time
              or resource.title in {'Crude oil', 'Water'}):
            self.rates = {}
            # Raw wood is mined by hand; see wood_mining().
            if resource.title != 'Raw wood':
                needs_producers = True
        else:
            raise NotImplementedError()

        if needs_producers:
            self.producers = tuple(parse_producers(resource.producers))

    def __str__(self) -> str:
        return self.title

    def intermediate(self, rates) -> (Iterable[Item], str, dict):
        """
        Unpack one ``recipes`` entry into (producers, title, rates).

        The producer is the entry's 'building' when present; otherwise the
        resource's own producers string is parsed.
        """
        building = rates.get('building')
        if building:
            producers = (all_items[building.lower()],)
        else:
            # Materialised because make() indexes self.producers, which a
            # one-shot generator does not support.
            producers = tuple(parse_producers(self.resource.producers))
        title = rates['process']
        sane_rates = self.calc_recipe(rates['inputs'], rates['outputs'])
        return producers, title, sane_rates

    @staticmethod
    def parse_side(s: str) -> Dict[str, float]:
        """Parse ``'Name, qty + Name, qty'`` into ``{name: qty}``."""
        out = {}
        for pair in s.split('+'):
            k, v = pair.split(',')
            out[k.strip()] = float(v.strip())
        return out

    @staticmethod
    def calc_recipe(inputs: Dict[str, float],
                    outputs: Dict[str, float]) -> Dict[str, float]:
        """
        Convert per-craft quantities into per-unit-time rates.

        Outputs become positive rates and inputs negative ones, each divided
        by the 'time' (or 'Time') entry of *inputs*. *inputs* is not
        modified.

        :raises KeyError: if *inputs* has neither a 'time' nor a 'Time' key
        """
        rates = defaultdict(float, outputs)
        # Work on a copy so the caller's dict keeps its time entry and the
        # same recipe data can be processed again.
        consumed = dict(inputs)
        time_key = 'time' if 'time' in consumed else 'Time'
        t = consumed.pop(time_key)
        for k in rates:
            rates[k] /= t
        for k, v in consumed.items():
            rates[k] -= v / t
        return rates

    def parse_recipe(self, recipe: str) -> Dict[str, float]:
        """
        Parse ``'inputs = outputs'`` (or inputs only) into rates.

        Without an ``=`` the output is one unit of the resource itself.
        """
        if '=' in recipe:
            inputs, outputs = recipe.split('=')
            outputs = self.parse_side(outputs)
        else:
            inputs = recipe
            outputs = {self.resource.title: 1}

        return self.calc_recipe(self.parse_side(inputs), outputs)

    def produce(self, cls, producer, **kwargs):
        """
        Instantiate *cls* for *producer* and add its footprint.

        Adds a 'Pollution' rate when the producer has a truthy pollution
        value, and an 'Area' rate computed from its '×'-separated
        dimensions.
        """
        kwargs.setdefault('title', self.title)
        recipe = cls(self.resource.title, producer, self.rates, **kwargs)
        if producer.pollution:
            recipe.rates['Pollution'] = float(producer.pollution)

        dims = tuple(float(x) for x in producer.dimensions.split('×'))
        recipe.rates['Area'] = dims[0] * dims[1]

        return recipe

    def for_energy(self, cls, **kwargs) -> Iterable[Recipe]:
        """
        Yield one recipe per producer, charged with its energy demand.

        Electric and heat producers get a negative 'Energy' / 'Heat' rate.
        Burner producers yield one recipe per valid fuel, consuming that
        fuel at ``energy / fuel_value``.

        :raises NotImplementedError: for any other energy description
        """
        for producer in self.producers:
            energy = -parse_power(producer.energy)

            if 'electric' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Energy'] = energy
                yield recipe

            elif 'heat' in producer.energy:
                recipe = self.produce(cls, producer, **kwargs)
                recipe.rates['Heat'] = energy
                yield recipe

            elif 'burner' in producer.energy:
                for fuel_name in producer.valid_fuel.split('+'):
                    fuel_name = fuel_name.strip().lower()
                    fuel = all_items[fuel_name]
                    fuel_value = parse_power(fuel.fuel_value)
                    new_kwargs = dict(kwargs)
                    if self.title:
                        title = self.title
                    else:
                        title = f'{self.resource} ({producer})'
                    new_kwargs['title'] = f'{title} fueled by {fuel_name}'

                    recipe = self.produce(cls, producer, **new_kwargs)
                    recipe.rates[fuel.title] = energy / fuel_value
                    yield recipe
            else:
                raise NotImplementedError()

    def wood_mining(self) -> Iterable[MiningRecipe]:
        """
        Yield a manual mining recipe for every (source, mining tool) pair
        found in the resource's mining-time field.
        """
        miners = tuple(
            ManualMiner(tool)
            for tool in all_items.values()
            if tool.prototype_type == 'mining-tool'
        )
        for m in self.tree_re.finditer(self.resource.mining_time):
            mining_time, source = int(m[1]), m[2]
            for miner in miners:
                yield self.produce(
                    MiningRecipe, miner,
                    mining_hardness=float(self.resource.mining_hardness),
                    mining_time=mining_time,
                    title=f'{self.resource} ({miner} from {source})')

    def make(self) -> Iterable[Recipe]:
        """
        Yield the recipes for the resource, choosing the recipe class from
        the kind of resource.

        :raises NotImplementedError: if no rule applies
        """
        if self.rates:
            if self.resource.prototype_type == 'technology':
                yield self.produce(
                    TechRecipe, self.producers[0],
                    cost_multiplier=float(self.resource.cost_multiplier))
            elif self.resource.title == 'Energy':
                yield self.produce(Recipe, self.producers[0])
            else:
                yield from self.for_energy(Recipe)
        elif self.resource.title == 'Raw wood':
            yield from self.wood_mining()
        elif self.resource.mining_time:
            yield from self.for_energy(
                MiningRecipe,
                mining_hardness=float(self.resource.mining_hardness),
                mining_time=float(self.resource.mining_time))
        elif self.resource.title == 'Crude oil':
            yield from self.for_energy(FluidRecipe)
        elif self.resource.title == 'Water':
            yield self.produce(FluidRecipe, self.producers[0])
        else:
            raise NotImplementedError()





def parse_power(s: str) -> float:
    """
    Extract a power or energy figure from *s* and scale it by its SI prefix.

    The magnitude and prefix are located with ``power_re``; the multiplier
    is looked up in ``si_facs``.
    """
    match = power_re.search(s)
    magnitude, prefix = match[1], match[2]
    return float(magnitude) * si_facs[prefix]





def items_of_type(t: str) -> Iterable[Item]:
    """Lazily produce every registered item whose ``prototype_type`` is *t*."""
    candidates = all_items.values()
    return (item for item in candidates if item.prototype_type == t)





# Producer names of the form "empty ... barrel" are skipped, like 'manual'.
barrel_re = re.compile(r'empty .+ barrel')


def parse_producers(s: str) -> Iterable[Item]:
    """
    Yield the producer items named in a '+'-separated string.

    Names are stripped and lower-cased first. 'furnace', 'assembling
    machine' and 'mining drill' expand to several items; 'manual' and
    barrel entries yield nothing; any other name is looked up directly in
    ``all_items``.
    """
    for raw_name in s.split('+'):
        name = raw_name.strip().lower()
        if name == 'manual' or barrel_re.match(name):
            continue

        if name == 'furnace':
            yield from items_of_type('furnace')
        elif name == 'assembling machine':
            for tier in range(1, 4):
                yield all_items[f'assembling machine {tier}']
        elif name == 'mining drill':
            for kind in ('burner', 'electric'):
                yield all_items[f'{kind} mining drill']
        else:
            yield all_items[name]





def trim(items: dict):
    """Remove, in place, every entry of *items* whose ``keep`` is falsy."""
    # Collect the keys first: a dict must not shrink while being iterated.
    doomed = tuple(name for name in items if not items[name].keep)
    print(f'Dropping {len(doomed)} items...')
    for name in doomed:
        items.pop(name)





def energy_data() -> dict:
    """
    Build the data dict for a synthetic 'Energy' item.

    It carries four recipes: solar panel, steam engine, and steam turbine
    fed with either Steam165 or Steam500. Figures are read from the
    registered items, except for the two hard-coded turbine outputs.
    """
    panel_lines = all_items['solar panel'].power_output.split('<br/>')
    solar_ave = parse_power(
        next(line for line in panel_lines if 'average' in line))

    engine = all_items['steam engine']
    engine_rate = float(engine.fluid_consumption.split('/')[0])
    engine_power = parse_power(engine.power_output)

    turbine = all_items['steam turbine']
    turbine_rate = float(turbine.fluid_consumption.split('/')[0])
    turbine_power_500 = 5.82e6  # ignore non-precise data and use this instead
    turbine_power_165 = 1.8e6   # from wiki page body

    def generator(building, process, energy, **steam):
        # One recipe entry: a unit of time plus optional steam in, energy out.
        return {
            'building': building,
            'process': process,
            'inputs': {'Time': 1, **steam},
            'outputs': {'Energy': energy},
        }

    return {
        'title': 'Energy',
        'recipes': (
            generator('Solar panel', 'Energy (Solar panel)', solar_ave),
            generator('Steam engine', 'Energy (Steam engine)', engine_power,
                      Steam165=engine_rate),
            generator('Steam turbine', 'Energy (Steam turbine @ 165C)',
                      turbine_power_165, Steam165=turbine_rate),
            generator('Steam turbine', 'Energy (Steam turbine @ 500C)',
                      turbine_power_500, Steam500=turbine_rate),
        ),
    }





def load(fn: str):
    """
    Populate the global ``all_items`` registry from the xz-compressed JSON
    file *fn*, keyed by lower-cased name, then add the synthetic 'energy'
    item.
    """
    global all_items
    with lzma.open(fn) as stream:
        raw = json.load(stream)
    all_items = {name.lower(): Item(entry) for name, entry in raw.items()}
    # energy_data() reads all_items, so it must run after the assignment.
    all_items['energy'] = Item(energy_data())





def get_recipes() -> (Dict[str, Recipe], Set[str]):
    """
    Generate the recipes of every registered item.

    Returns a dict of recipes keyed by title (a later recipe replaces an
    earlier one with the same title) and the set of all resource names that
    occur in any recipe's rates.
    """
    by_title = {}
    seen_resources = set()
    for item in all_items.values():
        for recipe in tuple(item.get_recipes()):
            by_title[recipe.title] = recipe
            seen_resources.update(recipe.rates)

    return by_title, seen_resources





def field_size(names: Iterable) -> int:
    """Return the length of the longest ``str()`` form among *names*."""
    return max(map(len, map(str, names)))





def write_csv_for_r(recipes: Sequence[Recipe], resources: Sequence[str],
                    fn: str):
    """
    Write the recipe/resource rate table as xz-compressed, column-aligned
    CSV text.

    Recipes go down, resources go right. The first line holds the resource
    names; each following line holds a recipe name and one signed
    scientific-notation rate per resource (0 where the recipe does not
    touch that resource).

    :param recipes: row objects; ``str(recipe)`` is the row label and
        ``recipe.rates`` supplies the values
    :param resources: column names, in output order
    :param fn: path of the file to create
    """
    rec_width = field_size(recipes)
    float_width = 15
    header_format = f'{{:{float_width+8}}}'
    # The leading newline ends the previous line, so every recipe gets a
    # row of its own.
    rec_format = '\n{:' + str(rec_width+1) + '}'
    # One numeric format per column, padded to at least the width of the
    # resource name; built once instead of once per cell.
    cell_formats = [f'{{:+{len(res)}.{float_width}e}},' for res in resources]

    with lzma.open(fn, 'wt') as f:
        f.write(' '*(rec_width+1))
        for res in resources:
            f.write(header_format.format(f'{res},'))

        for rec in recipes:
            f.write(rec_format.format(f'{rec},'))
            for res, cell_format in zip(resources, cell_formats):
                f.write(cell_format.format(rec.rates.get(res, 0)))





def write_for_numpy(recipes: Sequence[Recipe], resources: Sequence[str],
                    meta_fn: str, npz_fn: str):
    """
    Save the recipe matrix and its axis labels for NumPy/SciPy.

    Recipe titles and resource names go to *meta_fn* as two fixed-width
    unicode arrays (``recipe_names``, ``resource_names``). The rates go to
    *npz_fn* as a sparse CSR matrix with one row per resource and one
    column per recipe.

    :raises ValueError: if a recipe uses a resource missing from
        *resources*
    """
    rec_names = [r.title for r in recipes]
    w_rec = max(len(r) for r in rec_names)
    # asarray rather than array(..., copy=False): NumPy 2 raises for
    # copy=False when a copy cannot be avoided, as with a list.
    recipe_names = np.asarray(rec_names, dtype=f'U{w_rec}')

    w_res = max(len(r) for r in resources)
    resource_names = np.asarray(resources, dtype=f'U{w_res}')

    np.savez_compressed(meta_fn, recipe_names=recipe_names,
                        resource_names=resource_names)

    # Row index of each resource, built once; setdefault keeps the first
    # occurrence, matching list.index().
    row_of = {}
    for i, res in enumerate(resources):
        row_of.setdefault(res, i)

    rec_mat = lil_matrix((len(resources), len(recipes)))
    for j, rec in enumerate(recipes):
        for res, q in rec.rates.items():
            try:
                i = row_of[res]
            except KeyError:
                raise ValueError(f'{res!r} is not in resources') from None
            rec_mat[i, j] = q
    save_npz(npz_fn, rec_mat.tocsr())





def file_banner(fn):
    """Print the name of file *fn* followed by its size in whole kiB."""
    size_kib = getsize(fn) // 1024
    print(f'{fn} {size_kib} kiB')





def main():
    """
    Turn the pulled item data into recipe matrices.

    Loads ``items.json.xz``, drops the items that are not kept, generates
    all recipes, and writes them sorted by name: as NumPy/SciPy files and
    as a compressed CSV for R.
    """
    items_fn = 'items.json.xz'
    print(f'Loading {items_fn}... ', end='')
    load(items_fn)
    print(f'{len(all_items)} items')

    trim(all_items)

    print('Calculating recipes... ', end='')
    recipe_map, resource_set = get_recipes()
    print(f'{len(recipe_map)} recipes, {len(resource_set)} resources')

    # Sorted so that matrix rows and columns come out in a stable order.
    resources = sorted(resource_set)
    recipes = sorted(recipe_map.values(), key=lambda rec: rec.title)

    print('Saving files for numpy...')
    meta_fn = 'recipe-names.npz'
    npz_fn = 'recipes.npz'
    write_for_numpy(recipes, resources, meta_fn, npz_fn)
    for written in (meta_fn, npz_fn):
        file_banner(written)

    csv_fn = 'recipes.csv.xz'
    print('Saving recipes for use by R...')
    stdout.flush()
    write_csv_for_r(recipes, resources, csv_fn)
    file_banner(csv_fn)





# Run the preprocessing only when executed as a script, not on import.
if __name__ == '__main__':

    main()

That's followed by an analysis script that I won't post here, to constrain the scope of this first review.

python numpy scipy

asked 18 mins ago

Reinderien

5,280926

asked 18 mins ago

Reinderien

5,280926

asked 18 mins ago

Reinderien

5,280926

asked 18 mins ago

Reinderien

5,280926

asked 18 mins ago

Reinderien

5,280926

add a comment |

0

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
return StackExchange.using("mathjaxEditing", function () {
StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
});
});
}, "mathjax-editing");

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "196"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
autoActivateHeartbeat: false,
convertImagesToLinks: false,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: null,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f217047%2ffactorio-analysis-data-munging%23new-answer', 'question_page');
}
);

Post as a guest

Name

Required, but never shown

0

active

oldest

votes

0

active

oldest

votes

draft saved

draft discarded

Thanks for contributing an answer to Code Review Stack Exchange!

Please be sure to answer the question. Provide details and share your research!

But avoid …

Asking for help, clarification, or responding to other answers.

Making statements based on opinion; back them up with references or personal experience.

Use MathJax to format equations. MathJax reference.

To learn more, see our tips on writing great answers.

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Required, but never shown

Name

Required, but never shown

Name

Required, but never shown

This page is only for reference. If you need detailed information, please check here

搜尋此網誌

Gfrktyl