# Update knitr chunk options
# https://yihui.name/knitr/options/#chunk-options
knitr::opts_chunk$set(
  cache = FALSE,
  # dependencies 
  autodep = TRUE, 
  # Don't rerun if only comments changed
  cache.comments = FALSE,
  cache.lazy = TRUE, 
  echo = FALSE, 
  eval = TRUE,
  comment = NA,
  fig.align = "center",
  tidy = TRUE,
  fig.width = 8,
  fig.height = 6,
  out.width = "100%",
  echo = TRUE,
  warning = FALSE,
  message = FALSE,
  fig.align = "center",
  fig.path = paste0("figure/", knitr::current_input(), "/")
)

Last updated: 2017-09-22

Code version: 37e505a

Color distributions by topic

The last thing we’ll look at before presenting plots for the final model is the color distribution over each topic. This gives us a picture of what our color themes actually are!

library(dplyr)
library(purrr)
library(ggplot2)
Connecting to database 
Assigning themes to theme_df 
Assigning sets to sets_df 
Retrieving dataset form db 
Disconnecting from database 
Assigning full set set inventories to 'set_colors' 
knitr::read_chunk(here::here("code", "compare-models.R"))

Color distributions over topics

For these plots the distribution is represented by a weighted relevance score (the same that is used in the [LDAvis` package](http://www.kennyshirley.com/LDAvis/#topic=0&lambda=0.61&term=).

The beta \(\beta\) matrix, gives the posterior distribution of words given a topic, \(p(w|t)\). Relevance is computed \[ \text{relevance}(w|t) = \lambda \cdot p(w|t) + (1-\lambda)\cdot \frac{p(w|t)}{p(w)}. \]

library(dplyr)
library(ggplot2)

if (!exists("set_colors")) {
    legolda::load_csv(sample_data = FALSE)
    legolda::create_tables(sample_data = FALSE)
}

lda_models <- readRDS(here::here("inst", "data", "lda_models_all.RDS"))

set_topics <- lda_models %>% purrr::map(function(x) {
    class(x) <- "LDA"
    x
}) %>% purrr::map(tidytext::tidy, matrix = "gamma")

# Total frequency used in relevance score
word_freq <- set_colors %>% count(rgba) %>% mutate(percent = n/nrow(set_colors))

# Create palette
pal <- unique(set_colors$rgba)
names(pal) <- unique(pal)

# Plot weighted relevance of terms/colors for each topic
plot_relevance <- function(top_terms, bgcol) {
    
    ntopics <- max(top_terms$topic)
    subtitle <- paste0("Weighted color distribution for ", ntopics, " topics")
    
    top_terms %>% ggplot(aes(x = -order, y = relevance, fill = term)) + labs(x = "", 
        y = "Color relevance to topic", title = "Lego color topics", subtitle = subtitle) + 
        geom_col(show.legend = FALSE) + facet_wrap(~topic, scales = "free", 
        nrow = 5) + scale_fill_manual(values = pal) + coord_flip() + theme_bar(bgcol) + 
        theme(axis.text.x = element_blank(), axis.text.y = element_blank(), 
            panel.grid.major.x = element_blank())
}

top_list <- lda_models %>% purrr::map(legolda::top_terms, lambda = 0.7, nterms = 7, 
    freq = word_freq)

bgcol = "#a8a4a2"

plot_relevance(top_list[[2]], bgcol)  # 20 topics

plot_relevance(top_list[[3]], bgcol)  # 40 topics

plot_relevance(top_list[[5]], bgcol)  # 60 topics

## How many themes?

Even though our model scores might have leaned towards a model with fewere topics, we can see specific topics where adding more models separates themes that appear to be quite different. The firs two examples are of topic # 2 from the 30 topic model which seems more coherent in the 40 topic model (the sencond plot).

Topic 2 from the 30 topic model

model_num <- 2
topic_num <- 2

library(waffle)
plot_topic <- set_topics[[model_num]] %>% dplyr::filter(topic == topic_num) %>% 
    dplyr::arrange(desc(gamma)) %>% head(20)


waffle_prep <- function(document, sets) {
    document$set_num <- document$document
    document %>% left_join(sets, by = "set_num") %>% select(set_num, name, theme, 
        year, rgba) %>% group_by(theme, name, set_num, year) %>% tidyr::nest() %>% 
        mutate(counts = purrr::map(data, table))
}

bgcol <- "#e8e4e2"
w1 <- waffle_prep(plot_topic[1, ], set_colors)
w2 <- waffle_prep(plot_topic[4, ], set_colors)
w3 <- waffle_prep(plot_topic[6, ], set_colors)
w4 <- waffle_prep(plot_topic[7, ], set_colors)

waffle::iron(waff(w1, size = 0.5, rows = 1, nchr = 20, bgcol = bgcol), waff(w2, 
    size = 2, rows = 1, nchr = 13, bgcol = bgcol), waff(w3, size = 0.2, rows = 2, 
    nchr = 20, bgcol = bgcol), waff(w4, size = 0.2, rows = 4, nchr = 20, bgcol = bgcol))

# Which sets are most associated with a topic
model_num <- 2
topic_num <- 2

view_topic <- set_topics[[model_num]] %>% filter(topic == topic_num) %>% arrange(desc(gamma)) %>% 
    head(50) %>% mutate(set_num = document, gamma = round(gamma, 2)) %>% left_join(sets_df, 
    by = "set_num") %>% mutate(set_name = stringr::str_sub(name, 1, 20)) %>% 
    select(topic, gamma, set_name, set_num, theme_id, year, num_parts) %>% left_join(theme_df, 
    by = c(theme_id = "id")) %>% mutate(theme_name = name) %>% select(topic, 
    gamma, set_name, set_num, theme_name, theme_id, year, num_parts)

knitr::kable(view_topic, caption = paste0("Sets most associated with topic", 
    topic_num))
Sets most associated with topic2
topic gamma set_name set_num theme_name theme_id year num_parts
2 0.83 Mr. Magoriums big bo 66208-1 Basic Model 23 2007 9
2 0.76 Traffic Police Set 1271-2 Town Plan 372 1956 6
2 0.76 Traffic Police Set 271-2 Town Plan 372 1958 6
2 0.73 Supplementary Disks 8508-1 Throwbot Slizer 20 1999 5
2 0.68 Pneumatic Value Pack 5110-1 Technic 453 1990 4
2 0.68 Bangle Minis 7501-1 Clikits 500 2003 32
2 0.62 Trendy Tote Hot Pink 7510-1 Clikits 500 2003 89
2 0.61 Advent Calendar 2005 7574-18 Clikits 222 2005 8
2 0.56 Pretty in Pink Jewel 7533-1 Clikits 500 2005 66
2 0.56 Trendy Tote Sky Blue 7512-1 Clikits 500 2003 94
2 0.56 Pearly Pink Bracelet 7554-1 Clikits 500 2006 63
2 0.52 Bricks and Creations 4679-1 Basic Set 37 2004 2
2 0.52 Bricks and Creations 4679-2 Basic Set 37 2005 2
2 0.52 LEGO Creative Value 66311-1 Basic Set 37 2010 2
2 0.52 Motorized Simple Mac 9645-1 Technic 529 1997 2
2 0.52 Racers Turbo Pack 65062-1 Drome Racers 113 2002 2
2 0.52 Vladek Value Pack 65769-1 Knights Kingdom II 198 2005 4
2 0.52 Advent Calendar 2005 7574-13 Clikits 222 2005 4
2 0.52 Advent Calendar 2005 7574-17 Clikits 222 2005 4
2 0.51 Sweet Dreamy Jewels 7514-1 Clikits 500 2004 12
2 0.49 Volkswagen Beetle (V 10187-1 Sculptures 276 2008 1625
2 0.46 Nitro Muscle 8194-1 Tiny Turbos 120 2010 47
2 0.46 Rip 4574-1 Xalax 125 2001 7
2 0.46 Galaxy Patrol - Comp 8831-8 Series 7 Minifigures 542 2012 7
2 0.45 Jewels-n-Rings 7507-1 Clikits 500 2003 80
2 0.45 Blooms & Butterflies 7557-1 Clikits 500 2005 14
2 0.44 Rebel A-wing Pilot 5004408-1 Star Wars Rebels 182 2016 5
2 0.42 Advent Calendar 2005 7574-19 Clikits 222 2005 9
2 0.42 Interface Card and C 9771-1 Supplemental 532 1989 3
2 0.42 Tropical Breeze Jewe 7546-1 Clikits 500 2006 73
2 0.41 Advent Calendar 2004 7575-8 Clikits 222 2004 4
2 0.41 Le Fleuriste Collect lfv3-1 Other 301 2010 350
2 0.41 Advent Calendar 2005 7574-5 Clikits 222 2005 8
2 0.40 Friendship Frame / M 7504-1 Clikits 500 2004 15
2 0.39 Micro Mecha Horse MYERNEXO-2 Nexo Knights 605 2016 24
2 0.39 Pretty in Pink Beaut 7527-1 Clikits 500 2005 136
2 0.37 Captain America 41589-1 Brickheadz 610 2017 79
2 0.36 12V Replacement Elec 703-1 12V 242 1969 1
2 0.36 Advent Calendar 2011 7958-1 Star Wars 209 2011 25
2 0.36 Adventurers Value Pa 1024601-1 Desert 297 2001 3
2 0.36 Click-N-Store Jewelr 65542-1 Clikits 500 2004 2
2 0.36 Color Sensor for Min MS1038-1 NXT 259 2006 1
2 0.36 Environment Plate 359-1 Supplemental 473 1972 1
2 0.36 Infrared Seeker for 2852725-1 NXT 259 2011 1
2 0.36 Jewels-n-Bands Click 65363-1 Clikits 500 2004 2
2 0.36 Jewels-n-Clips Click 65364-1 Clikits 500 2004 2
2 0.36 Jewels-n-Rings Click 65362-1 Clikits 500 2004 2
2 0.36 Knights’ Kingdom Adv 50799-1 Knights Kingdom II 198 2005 3
2 0.36 Knights’ Kingdom Val kk2vp1-1 Knights Kingdom II 198 2004 3
2 0.36 Knights’ Kingdom Val kk2vp2-1 Knights Kingdom II 198 2004 3
saveRDS(view_topic, here::here("inst", "data", "view-topic-2-2.RDS"))

Topic 2 from the 40 topic model

model_num <- 3
topic_num <- 2

library(waffle)
plot_topic <- set_topics[[model_num]] %>% dplyr::filter(topic == topic_num) %>% 
    dplyr::arrange(desc(gamma)) %>% head(20)


waffle_prep <- function(document, sets) {
    document$set_num <- document$document
    document %>% left_join(sets, by = "set_num") %>% select(set_num, name, theme, 
        year, rgba) %>% group_by(theme, name, set_num, year) %>% tidyr::nest() %>% 
        mutate(counts = purrr::map(data, table))
}

bgcol <- "#e8e4e2"
w1 <- waffle_prep(plot_topic[1, ], set_colors)
w2 <- waffle_prep(plot_topic[4, ], set_colors)
w3 <- waffle_prep(plot_topic[6, ], set_colors)
w4 <- waffle_prep(plot_topic[8, ], set_colors)

waffle::iron(waff(w1, size = 0.5, rows = 1, nchr = 20, bgcol = bgcol), waff(w2, 
    size = 2, rows = 4, nchr = 13, bgcol = bgcol), waff(w3, size = 0.2, rows = 3, 
    nchr = 20, bgcol = bgcol), waff(w4, size = 0.2, rows = 2, nchr = 20, bgcol = bgcol))

# Which sets are most associated with a topic
model_num <- 3
topic_num <- 2

view_topic <- set_topics[[model_num]] %>% filter(topic == topic_num) %>% arrange(desc(gamma)) %>% 
    head(50) %>% mutate(set_num = document, gamma = round(gamma, 2)) %>% left_join(sets_df, 
    by = "set_num") %>% mutate(set_name = stringr::str_sub(name, 1, 20)) %>% 
    select(topic, gamma, set_name, set_num, theme_id, year, num_parts) %>% left_join(theme_df, 
    by = c(theme_id = "id")) %>% mutate(theme_name = name) %>% select(topic, 
    gamma, set_name, set_num, theme_name, theme_id, year, num_parts)

knitr::kable(view_topic, caption = "Sets most associated with topic 37")
Sets most associated with topic 37
topic gamma set_name set_num theme_name theme_id year num_parts
2 0.56 Rip 4574-1 Xalax 125 2001 7
2 0.51 Volkswagen Beetle (V 10187-1 Sculptures 276 2008 1625
2 0.50 Gavla 8948-1 Matoran of Light 333 2008 14
2 0.50 Toa Gali 8688-1 Mistika 338 2008 60
2 0.49 Piraka 7137-1 Stars 345 2010 15
2 0.46 Nitro Muscle 8194-1 Tiny Turbos 120 2010 47
2 0.46 Galaxy Patrol - Comp 8831-8 Series 7 Minifigures 542 2012 7
2 0.45 Micro Mecha Horse MYERNEXO-2 Nexo Knights 605 2016 24
2 0.44 Dunkan Bulk 7168-1 Heroes 401 2010 17
2 0.42 Tarix 8981-1 Glatorian 331 2009 57
2 0.41 Le Fleuriste Collect lfv3-1 Other 301 2010 350
2 0.39 Captain America 41589-1 Brickheadz 610 2017 79
2 0.38 Vahki Bordakh 8615-1 Vahki 357 2004 32
2 0.37 Toa Mahri Hahli 8914-1 Toa Mahri 352 2007 58
2 0.37 Vahki Bordakh Limite 8615-2 Vahki 357 2004 33
2 0.37 Kendo Fighter 71011-12 Series 15 Minifigures 554 2016 7
2 0.36 Robin NEX271714-1 Nexo Knights 605 2017 19
2 0.36 Globert 41533-1 Series 4 584 2015 45
2 0.36 Vamprah 8692-1 Phantoka 339 2008 48
2 0.36 Gelu 8988-1 Glatorian Legends 332 2009 52
2 0.35 Advent Calendar 2014 75056-7 Star Wars 225 2014 9
2 0.34 ChromaStone 8411-1 Ben 10 270 2010 21
2 0.34 Kiina 8987-1 Glatorian Legends 332 2009 43
2 0.33 Advent Calendar 2008 7979-4 Castle 219 2008 8
2 0.32 Hockey Player - Comp 8804-8 Series 4 Minifigures 539 2011 11
2 0.32 Vezok 8902-1 Piraka 340 2006 41
2 0.31 Advent Calendar 2008 7979-2 Castle 219 2008 5
2 0.31 Toa Nokama 8602-1 Toa Metru 353 2004 46
2 0.31 Visorak Battle Ram 8757-1 Playsets 341 2005 190
2 0.31 Advent Calendar 2010 7952-14 Castle 219 2010 5
2 0.31 MTT 30059-1 Star Wars 158 2012 51
2 0.31 Spidermonkey 8409-1 Ben 10 270 2010 21
2 0.30 Loki’s Cosmic Cube E 6867-1 Avengers 487 2012 180
2 0.30 Combo NEXO Powers Wa 70372-1 Nexo Knights 605 2017 5
2 0.30 Toa Tahu 8689-1 Mistika 338 2008 73
2 0.30 Rahaga Gaaki 4868-1 Rahaga 342 2005 28
2 0.29 Vulture Droid foil p SW911723-1 Star Wars Episode 3 162 2017 35
2 0.29 Inika Toa Hahli 8728-1 Toa Inika 351 2006 46
2 0.29 Joachim Löw 71014-1 DFB Minifigures 557 2016 6
2 0.29 Jor-El 5001623-1 Superman 489 2013 5
2 0.29 Police Patrol 4963-1 Duplo 504 2006 5
2 0.29 Boogly 41535-1 Series 4 584 2015 52
2 0.29 Lava Chamber Gate 8893-1 Playsets 341 2006 375
2 0.29 Starblaster Showdown 76019-1 Guardians of the Galaxy 483 2014 195
2 0.29 Dalu 8726-1 Matoran of Voya Nui 337 2006 25
2 0.28 Sir Jayko 8792-1 Knights Kingdom II 198 2005 42
2 0.28 Duplo Airport Rescue 7844-1 Duplo 504 2004 28
2 0.28 Vampos 41534-1 Series 4 584 2015 59
2 0.28 Sir Adric 8704-1 Knights Kingdom II 198 2006 40
2 0.28 Inika Toa Hewkii 8730-1 Toa Inika 351 2006 62
saveRDS(view_topic, here::here("inst", "data", "view-topic-3-2.RDS"))

One final plot from topic 32 that I looked questionable but seems to have grouped some related (if small) sets.

# Which sets are most associated with a topic
model_num <- 3
topic_num <- 32

plot_topic <- set_topics[[model_num]] %>% dplyr::filter(topic == topic_num) %>% 
    dplyr::arrange(desc(gamma)) %>% head(10)
# plot_topic

waffle_prep <- function(document, sets) {
    document$set_num <- document$document
    document %>% left_join(sets, by = "set_num") %>% select(set_num, name, theme, 
        year, rgba) %>% group_by(theme, name, set_num, year) %>% tidyr::nest() %>% 
        mutate(counts = purrr::map(data, table))
}

w1 <- waffle_prep(plot_topic[1, ], set_colors)
w2 <- waffle_prep(plot_topic[4, ], set_colors)
w3 <- waffle_prep(plot_topic[6, ], set_colors)
w4 <- waffle_prep(plot_topic[7, ], set_colors)

waffle::iron(waff(w1, size = 0.5, rows = 1, nchr = 20, bgcol = bgcol), waff(w2, 
    size = 0.5, rows = 4, nchr = 18, bgcol = bgcol), waff(w3, size = 0.5, rows = 1, 
    nchr = 19, bgcol = bgcol), waff(w4, size = 0.5, rows = 4, nchr = 20, bgcol = bgcol))

model_num <- 3
topic_num <- 32

view_topic <- set_topics[[model_num]] %>% filter(topic == topic_num) %>% arrange(desc(gamma)) %>% 
    head(50) %>% mutate(set_num = document, gamma = round(gamma, 2)) %>% left_join(sets_df, 
    by = "set_num") %>% mutate(set_name = stringr::str_sub(name, 1, 20)) %>% 
    select(topic, gamma, set_name, set_num, theme_id, year, num_parts) %>% left_join(theme_df, 
    by = c(theme_id = "id")) %>% mutate(theme_name = name) %>% select(topic, 
    gamma, set_name, set_num, theme_name, theme_id, year, num_parts)

saveRDS(view_topic, here::here("inst", "data", "view-topic-32-3.RDS"))

knitr::kable(view_topic, caption = paste0("Sets most associated with topic ", 
    topic_num))
Sets most associated with topic 32
topic gamma set_name set_num theme_name theme_id year num_parts
32 0.77 Advent Calendar 2007 7907-19 City 220 2007 9
32 0.60 Advent Calendar 2008 7979-16 Castle 219 2008 9
32 0.57 Advent Calendar 2009 7687-10 City 220 2009 7
32 0.56 Small Plates, Disks 5198-1 Service Packs 443 1989 56
32 0.55 Light Prisms & Holde 1147-1 Train 456 1981 7
32 0.55 Light Transmitting E 5073-1 Train 456 1987 7
32 0.52 LEGO Heart (Legoland llca8-1 Legoland Parks 425 2004 58
32 0.50 Kanoka Launcher And 3259-1 Supplemental 346 2004 2
32 0.48 Advent Calendar 2014 60063-14 City 220 2014 8
32 0.46 Surface Hopper 6806-1 Classic Space 130 1985 23
32 0.46 Space Light and Rada 5042-1 Space 452 1991 50
32 0.46 Small Plates with To 5053-1 Service Packs 443 1993 76
32 0.45 Advent Calendar 2006 7904-11 City 220 2006 6
32 0.45 Advent Calendar 2005 7324-13 City 220 2005 6
32 0.44 Transparent Bricks 16-1 Service Packs 443 1988 32
32 0.44 Muji Christmas Set 8465934-1 Other 301 2009 120
32 0.44 Light and Transparen 9866-1 Technic 1 1992 12
32 0.44 Light Bricks (4.5V) 1344-1 Service Packs 524 1986 12
32 0.44 Advent Calendar 2009 6299-3 Pirates 224 2009 9
32 0.41 Advent Calendar 1998 1298-4 Classic Basic 221 1998 9
32 0.41 Holiday Ornament wit 853344-1 Christmas 227 2011 32
32 0.41 Advent Calendar 2005 7324-15 City 220 2005 11
32 0.41 Monthly Mini Model B 40040-1 Monthly Mini Model Build 409 2012 15
32 0.41 Transparent Bricks 5156-1 Service Packs 443 1991 17
32 0.41 Wizard 7955-1 Kingdoms 196 2010 19
32 0.41 Advent Calendar 2013 60024-13 City 220 2013 21
32 0.40 Darth Maul 5000062-1 Minifig Pack 178 2012 3
32 0.40 Advent Calendar 2010 7952-10 Castle 219 2010 5
32 0.40 Holiday Ornament wit 853346-1 Christmas 227 2011 38
32 0.40 Ring of Fire 70100-1 Speedorz 572 2013 78
32 0.39 MUJI Christmas Set e1a1404-1 Other 301 2011 120
32 0.39 Advent Calendar 2004 4924-24 Creator 223 2004 12
32 0.39 Advent Calendar 1998 1298-13 Classic Basic 221 1998 12
32 0.37 The Collector - San comcon035-1 Guardians of the Galaxy 483 2014 8
32 0.36 Glider 5966-1 Exo-Force 389 2006 22
32 0.36 Ultimate Macy 70331-1 Nexo Knights 605 2016 101
32 0.36 Advent Calendar 2007 7600-20 Belville 218 2007 10
32 0.36 Monthly Mini Model B 40039-1 Monthly Mini Model Build 409 2012 16
32 0.36 Color Light 4056-1 Studios 273 2001 13
32 0.36 Cosmic Comet 6825-1 Classic Space 130 1985 40
32 0.36 Transparent Plates, 5128-1 Service Packs 443 1996 80
32 0.34 Advent Calendar 2002 4524-21 Creator 223 2002 10
32 0.34 Advent Calendar 2007 7600-25 Belville 218 2007 9
32 0.33 Gray Space Elements 13-1 Space 452 1981 10
32 0.33 Lighting Set Electri 7861-1 12V 242 1980 28
32 0.33 Advent Calendar 2002 4524-15 Creator 223 2002 11
32 0.32 Chopper Cop 6324-1 Police 100 1998 24
32 0.32 Sabah Promotional Se 1778-1 Basic Model 468 1997 9
32 0.32 Advent Calendar 2011 7553-7 City 220 2011 18
32 0.32 Advent Calendar 2000 2250-2 Advent Sub-Set 217 2000 12