Chapter 12 Utbildningsnivå

Befolkning efter region, ålder, utbildningsnivå, kön och år ålder 16-74 år kön män och kvinnor år 2017

# Load the education table and put the education levels in a fixed order.
# The index vector refers to the levels as ordered by as.factor().
utb <- readfile("UF0506A1.csv")
utb$utbildningsnivå <- factor(
  utb$utbildningsnivå,
  levels = levels(as.factor(utb$utbildningsnivå))[c(3, 1, 2, 6, 7, 5, 4, 8)]
)

# Stacked shares of each education level per two-character region prefix
# (presumably the county code), one panel per sex. Rows with unknown
# education level are dropped first.
utb %>%
  filter(utbildningsnivå != "uppgift om utbildningsnivå saknas") %>%
  mutate(lnkod_n = substr(region, 1, 2)) %>%
  ggplot(aes(x = lnkod_n, y = salary, fill = utbildningsnivå)) +
  geom_col(position = "fill") +
  facet_grid(. ~ kön) +
  theme(legend.position = "bottom")

Figure 12.1: Education distribution in different counties.

# Map of the share with "eftergymnasial utbildning, 3 år eller mer",
# one panel per sex.
readfile("UF0506A1.csv") %>%
  # Count per education level, region and sex.
  group_by(utbildningsnivå, region, kön) %>%
  mutate(utbregno = sum(salary)) %>%
  # The denominator is the sum over all rows of the region, i.e. both sexes
  # and all education levels.
  group_by(region) %>%
  mutate(perc = utbregno / sum(utbregno)) %>%
  filter(utbildningsnivå == "eftergymnasial utbildning, 3 år eller mer") %>%
  # Numeric two-digit region prefix, used as the key into the map data.
  mutate(lnkod_n = as.numeric(substr(region, 1, 2))) %>%
  right_join(map_ln_n, by = "lnkod_n") %>%
  ggplot() +
  geom_polygon(
    aes(x = ggplot_long, y = ggplot_lat, group = lnkod, fill = perc)
  ) +
  facet_grid(. ~ kön) +
  coord_equal()

Figure 12.2: Percentage of the population who have 3 years or more post-secondary education, but not postgraduate education.

# Scatter plot with regression line and Pearson correlation: salary from
# salary_2017 against the share of the region with
# "eftergymnasial utbildning, 3 år eller mer".
readfile("UF0506A1.csv") %>%
  # One row per education level and region.
  group_by(utbildningsnivå, region) %>%
  summarize(utbregno = sum(salary)) %>%
  # Share of each education level within the region.
  group_by(region) %>%
  mutate(perc = utbregno / sum(utbregno)) %>%
  filter(utbildningsnivå == "eftergymnasial utbildning, 3 år eller mer") %>%
  mutate(lnkod_n = as.numeric(substr(region, 1, 2))) %>%
  right_join(salary_2017, by = c("lnkod_n" = "Länskod")) %>%
  ggscatter(
    x = "salary",
    y = "perc",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  labs(
    x = "Salary (SEK)",
    y = "Percent with 3 years post-secondary education"
  )

The correlation between the proportion of the population who have 3 years or more post-secondary education, but not postgraduate education, and the salaries of engineers in the region.

Figure 12.3: The correlation between the proportion of the population who have 3 years or more post-secondary education, but not postgraduate education, and the salaries of engineers in the region.

12.1 The correlation between the proportion of engineers who are women and the number of the population who have 3 years or more post-secondary education in the region. Year 2014 - 2018

Average basic salary, monthly salary and women's salary as a percentage of men's salary by region, sector, occupational group (SSYK 2012) and sex. Year 2014 - 2018 Monthly salary All sectors 214 Engineering professionals

Average basic salary, monthly salary and women's salary as a percentage of men's salary by region, sector, occupational group (SSYK 2012) and sex. Year 2014 - 2018 Number of employees All sectors 214 Engineering professionals

Population 16-74 years of age by region, highest level of education, age and sex. The year 1985 - 2018. total 16-74 years

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Point plot of the women's share among engineers against the number of
# people with 3 years or more post-secondary education.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = sum_edu, y = perc_women, colour = region, size = year2)) +
  geom_point() +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Per cent of engineers who are women"
  ) +
  theme(legend.position = "bottom")

## Warning: Removed 2 rows containing missing values (geom_point).

The correlation between the proportion of engineers who are women and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

Figure 12.4: The correlation between the proportion of engineers who are women and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Scatter plot with regression line and Pearson correlation of the women's
# share among engineers against the number of people with 3 years or more
# post-secondary education.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "sum_edu",
    y = "perc_women",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Per cent of engineers who are women"
  )

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing non-finite values (stat_cor).

## Warning: Removed 2 rows containing missing values (geom_point).

Figure 12.5: The correlation between the proportion of engineers who are women and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Add the education counts to the engineer table and keep only the rows for
# 3 years or more post-secondary education.
tb <- readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Total over all rows of the region and year.
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# Linear model of the women's share among engineers.
model <- lm(perc_women ~ sum_edu + year2 + log(salary.y), data = tb)

# Coefficient table.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = "Women engineers and population with 3 years or more post-secondary education"
)

Table 12.1: Women engineers and population with 3 years or more post-secondary education
term	estimate	std.error	statistic	p.value
(Intercept)	-885.3287385	229.7997838	-3.852609	0.0002512
sum_edu	0.0000214	0.0000016	13.652812	0.0000000
year2	0.4719358	0.1228379	3.841941	0.0002604
log(salary.y)	-4.6839227	3.6243664	-1.292342	0.2003709

# Type II Anova table for the model fitted above.
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = "Anova report from linear model fit"
)

Table 12.1: Anova report from linear model fit
term	sumsq	df	statistic	p.value
sum_edu	292.335537	1	186.399278	0.0000000
year2	23.149348	1	14.760510	0.0002604
log(salary.y)	2.619345	1	1.670149	0.2003709
Residuals	112.919744	72	NA	NA

12.2 The correlation between the number of engineers and the number of the population who have 3 years or more post-secondary education in the regions, Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Point plot of the number of engineers against the number of people with
# 3 years or more post-secondary education.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = sum_edu, y = sum_ing, colour = region, size = year2)) +
  geom_point() +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Number of the population who are engineers"
  ) +
  theme(legend.position = "bottom")

The correlation between the number of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

Figure 12.6: The correlation between the number of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Scatter plot with regression line and Pearson correlation of the number of
# engineers against the number of people with 3 years or more post-secondary
# education.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "sum_edu",
    y = "sum_ing",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Number of the population who are engineers"
  )

Figure 12.7: The correlation between the number of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Add the education counts to the engineer table and keep only the rows for
# 3 years or more post-secondary education.
tb <- readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Total over all rows of the region and year.
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# Linear model of the number of people with the education level, with an
# interaction between log salary and total population.
model <- lm(sum_edu ~ year2 + log(salary.y) * sum_pop, data = tb)

# Coefficient table.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = "Population with 3 years or more post-secondary education and number of population"
)

Table 12.2: Population with 3 years or more post-secondary education and number of population
term	estimate	std.error	statistic	p.value
(Intercept)	5.129274e+06	5.118336e+06	1.0021372	0.3195877
year2	1.213056e+02	2.669735e+03	0.0454373	0.9638828
log(salary.y)	-5.089895e+05	1.206615e+05	-4.2183249	0.0000698
sum_pop	-7.867544e+00	1.172156e+00	-6.7120268	0.0000000
log(salary.y):sum_pop	7.616666e-01	1.099515e-01	6.9272996	0.0000000

# Type II Anova table for the model fitted above.
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = "Anova report from linear model fit"
)

Table 12.2: Anova report from linear model fit
term	sumsq	df	statistic	p.value
year2	1490425	1	0.0020646	0.9638828
log(salary.y)	2556263146	1	3.5409595	0.0638575
sum_pop	567943896187	1	786.7211812	0.0000000
log(salary.y):sum_pop	34642763902	1	47.9874796	0.0000000
Residuals	52699616348	73	NA	NA

12.3 The correlation between the number of engineers and the proportion of engineers who are women in the regions, Year 2014 - 2018

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CG_10.csv")
tb <- readfile("000000CD_10.csv") %>%
  left_join(tb, by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(perc_women = as.numeric(sub("%", "", perc_women(salary.x)))) %>%
  mutate(perc_salary = as.numeric(sub("%", "", perc_sal(salary.y)))) %>%
  mutate(sum_ing = sum(salary.x))

# Point plot of the number of engineers against the women's share among
# engineers. The grouping below was previously applied twice in a row; the
# redundant second call has been removed.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = perc_women, y = sum_ing, colour = region, size = year2)) +
  geom_point() +
  theme(legend.position = "bottom") +
  labs(
    x = "Per cent of engineers who are women",
    y = "Number of the population who are engineers"
  )

## Warning: Removed 2 rows containing missing values (geom_point).

Figure 12.8: The correlation between the number of engineers and the proportion of engineers who are women in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CG_10.csv")
tb <- readfile("000000CD_10.csv") %>%
  left_join(tb, by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(perc_women = as.numeric(sub("%", "", perc_women(salary.x)))) %>%
  mutate(perc_salary = as.numeric(sub("%", "", perc_sal(salary.y)))) %>%
  mutate(sum_ing = sum(salary.x))

# Scatter plot with regression line and Pearson correlation of the number of
# engineers against the women's share among engineers. The grouping below was
# previously applied twice in a row; the redundant second call has been
# removed.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "perc_women", y = "sum_ing",
    add = "reg.line", conf.int = TRUE,
    cor.coef = TRUE, cor.method = "pearson"
  ) +
  labs(
    x = "Per cent of engineers who are women",
    y = "Number of the population who are engineers"
  )

## Warning: Removed 2 rows containing non-finite values (stat_smooth).

## Warning: Removed 2 rows containing non-finite values (stat_cor).

## Warning: Removed 2 rows containing missing values (geom_point).

Figure 12.9: The correlation between the number of engineers and the proportion of engineers who are women in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Add the education counts to the engineer table and keep only the rows for
# 3 years or more post-secondary education.
tb <- readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Total over all rows of the region and year.
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# Linear model of the number of engineers with a three-way interaction
# between log salary, total population and the women's share.
model <- lm(
  sum_ing ~ year2 + log(salary.y) * sum_pop * perc_women,
  data = tb
)

# Coefficient table.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = "Engineers and per cent of engineers who are women"
)

Table 12.3: Engineers and per cent of engineers who are women
term	estimate	std.error	statistic	p.value
(Intercept)	-6.222017e+05	9.146425e+05	-0.6802676	0.4986790
year2	2.658654e+02	2.012575e+02	1.3210214	0.1909883
log(salary.y)	1.026812e+04	8.434278e+04	0.1217428	0.9034671
sum_pop	-1.893141e+00	7.269538e-01	-2.6042114	0.0113315
perc_women	2.644795e+04	4.740960e+04	0.5578605	0.5787989
log(salary.y):sum_pop	1.761138e-01	6.822090e-02	2.5815235	0.0120301
log(salary.y):perc_women	-2.623465e+03	4.465157e+03	-0.5875415	0.5588151
sum_pop:perc_women	6.696160e-02	3.382790e-02	1.9794780	0.0518730
log(salary.y):sum_pop:perc_women	-6.122800e-03	3.174900e-03	-1.9284725	0.0580350

# Type II Anova table for the model fitted above.
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = "Anova report from linear model fit"
)

Table 12.3: Anova report from linear model fit
term	sumsq	df	statistic	p.value
year2	5867011	1	1.7450974	0.1909883
log(salary.y)	5235904	2	0.7786898	0.4631161
sum_pop	846906168	1	251.9057602	0.0000000
perc_women	156863189	2	23.3288777	0.0000000
log(salary.y):sum_pop	29562096	1	8.7930192	0.0041862
log(salary.y):perc_women	47669941	1	14.1790593	0.0003527
sum_pop:perc_women	199596081	1	59.3683271	0.0000000
log(salary.y):sum_pop:perc_women	12503284	1	3.7190063	0.0580350
Residuals	225253735	67	NA	NA

12.4 The correlation between the salary of engineers and the number of the population who have 3 years or more post-secondary education in the regions, Year 2014 - 2018

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Point plot of the engineers' salary against the number of people with
# 3 years or more post-secondary education, one panel per sex.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = sum_edu, y = salary.y, colour = region, size = year2)) +
  geom_point() +
  facet_grid(. ~ sex) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Salary of engineers"
  ) +
  theme(legend.position = "bottom")

The correlation between the salary of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

Figure 12.10: The correlation between the salary of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Scatter plot with regression line and Pearson correlation of the engineers'
# salary against the number of people with 3 years or more post-secondary
# education, one panel per sex.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "sum_edu",
    y = "salary.y",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Salary of engineers"
  ) +
  facet_grid(. ~ sex)

Figure 12.11: The correlation between the salary of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Add the education counts to the engineer table and keep only the rows for
# 3 years or more post-secondary education.
tb <- readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Total over all rows of the region and year.
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# Numeric modelling frame without grouping and without incomplete rows.
# salary.y is deliberately the 10th (last) column; the cubist() call below
# drops it from the predictors by position.
tb1 <- na.omit(
  select(
    ungroup(tb),
    utbregno, year2, perc_edu, sum_pop, sum_edu,
    perc_women, perc_salary, sum_ing, perc_eng, salary.y
  )
)

set.seed(1)

# Rule-based Cubist model: all columns except salary.y predict salary.y.
cmodel <- cubist(tb1[, -10], tb1$salary.y)

summary(cmodel)

## 
## Call:
## cubist.default(x = tb1[, -10], y = tb1$salary.y)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Sat Nov 09 22:47:29 2019
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 76 cases (10 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [76 cases, mean 42077.6, range 34700 to 49400, est err 784.7]
## 
##  outcome = -1504741.3 + 0.0585 sum_edu - 0.491 sum_ing - 0.027 utbregno
##            + 93129 perc_eng + 765 year2 - 17209 perc_edu
## 
## 
## Evaluation on training data (76 cases):
## 
##     Average  |error|              818.3
##     Relative |error|               0.39
##     Correlation coefficient        0.90
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##           100%    utbregno
##           100%    year2
##           100%    perc_edu
##           100%    sum_edu
##           100%    sum_ing
##           100%    perc_eng
## 
## 
## Time: 0.0 secs

# Linear model of log salary on the predictors listed in the Cubist rule
# above, except utbregno.
model <- lm(
  log(salary.y) ~ year2 + perc_edu + sum_edu + sum_ing + perc_eng,
  data = tb
)

# Coefficient table.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = "Salary of engineers and population with 3 years or more post-secondary education"
)

Table 12.4: Salary of engineers and population with 3 years or more post-secondary education
term	estimate	std.error	statistic	p.value
(Intercept)	-28.8586727	4.4852648	-6.434107	0.0000000
year2	0.0195703	0.0022270	8.787717	0.0000000
perc_edu	-0.6184356	0.0682540	-9.060795	0.0000000
sum_edu	0.0000010	0.0000002	6.356104	0.0000000
sum_ing	-0.0000092	0.0000027	-3.441580	0.0009666
perc_eng	1.5546134	0.5363753	2.898369	0.0049680

# Type II Anova table for the model fitted above.
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = "Anova report from linear model fit"
)

Table 12.4: Anova report from linear model fit
term	sumsq	df	statistic	p.value
year2	0.0565259	1	77.223967	0.0000000
perc_edu	0.0600936	1	82.097998	0.0000000
sum_edu	0.0295718	1	40.400055	0.0000000
sum_ing	0.0086698	1	11.844473	0.0009666
perc_eng	0.0061490	1	8.400541	0.0049680
Residuals	0.0527021	72	NA	NA

12.5 The correlation between the proportion of the population who have 3 years or more post-secondary education and the number of the population who have 3 years or more post-secondary education in the regions, Year 2014 - 2018

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("000000CD_10.csv") %>%
  left_join(readfile("000000CG_10.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Point plot of the education table's own `salary` column (the count, per the
# axis label) against the share with 3 years or more post-secondary
# education, one panel per sex.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = perc_edu, y = salary, colour = region, size = year2)) +
  geom_point() +
  facet_grid(. ~ sex) +
  labs(
    x = "Per cent of the population with 3 years or more post-secondary education",
    y = "Number of the population with 3 years or more post-secondary education"
  ) +
  theme(legend.position = "bottom")

Figure 12.12: The correlation between the proportion of the population who have 3 years or more post-secondary education, but not postgraduate education and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2014 - 2018.

12.6 The correlation between the proportion of engineers who are women and the number of the population who have 3 years or more post-secondary education in the region. Year 2003 - 2013

Average basic salary, monthly salary and women's salary as a percentage of men's salary by region, sector, occupational group (SSYK 2012) and sex. Year 2003 - 2013 Monthly salary All sectors 214 Engineering professionals

Average basic salary, monthly salary and women's salary as a percentage of men's salary by region, sector, occupational group (SSYK 2012) and sex. Year 2003 - 2013 Number of employees All sectors 214 Engineering professionals

Population 16-74 years of age by region, highest level of education, age and sex. The year 1985 - 2018. total 16-74 years

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(readfile("AM0110A2_3.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Point plot of the women's share among engineers against the number of
# people with 3 years or more post-secondary education.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = sum_edu, y = perc_women, colour = region, size = year2)) +
  geom_point() +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Per cent of engineers who are women"
  ) +
  theme(legend.position = "bottom")

## Warning: Removed 12 rows containing missing values (geom_point).

Figure 12.13: The correlation between the proportion of engineers who are women and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), 2003 - 2013.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(readfile("AM0110A2_3.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Scatter plot with regression line and Pearson correlation of the women's
# share among engineers against the number of people with 3 years or more
# post-secondary education.
readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  # right_join keeps every row of tb, so unmatched rows carry NA values.
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "sum_edu",
    y = "perc_women",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Per cent of engineers who are women"
  )

## Warning: Removed 12 rows containing non-finite values (stat_smooth).

## Warning: Removed 12 rows containing non-finite values (stat_cor).

## Warning: Removed 12 rows containing missing values (geom_point).

Figure 12.14: The correlation between the proportion of engineers who are women and the number of the population who have 3 years or more post-secondary education, but not postgraduate education in the regions (NUTS2), Year 2003 - 2013.

# Join the two engineer tables on region, year and sex; `salary.x` is the
# value column of the first file and `salary.y` that of the second.
# perc_women() and perc_sal() are helpers defined elsewhere; any "%" in their
# output is stripped before the conversion to numeric.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(readfile("AM0110A2_3.csv"), by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Add the education counts to the engineer table and keep only the rows for
# 3 years or more post-secondary education.
tb <- readfile("UF0506A1_1.csv") %>%
  # Count per education level, region, year and sex.
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  # Share of that count within the region, year and sex.
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  # Total over all rows of the region and year.
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  # Count per education level, region and year.
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# Linear model of the women's share among engineers.
model <- lm(perc_women ~ sum_edu + year2 + log(salary.y), data = tb)

# Coefficient table.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = "Women engineers and population with 3 years or more post-secondary education"
)

Table 12.5: Women engineers and population with 3 years or more post-secondary education
term	estimate	std.error	statistic	p.value
(Intercept)	-667.8801009	174.6904512	-3.823220	0.0001945
sum_edu	0.0000181	0.0000025	7.379324	0.0000000
year2	0.3172738	0.0985096	3.220740	0.0015763
log(salary.y)	4.6156447	3.0814454	1.497883	0.1363226

# Type II Anova table for the model fitted above.
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = "Anova report from linear model fit"
)

Table 12.5: Anova report from linear model fit
term	sumsq	df	statistic	p.value
sum_edu	304.71404	1	54.454420	0.0000000
year2	58.04577	1	10.373164	0.0015763
log(salary.y)	12.55495	1	2.243654	0.1363226
Residuals	816.98142	146	NA	NA

12.7 The correlation between the number of engineers and the number of the population who have 3 years or more post-secondary education in the regions, Year 2003 - 2013.

# Engineer data: AM0110A4_3.csv (left, salary.x) joined with AM0110A2_3.csv
# (right, salary.y) on region, year and sex, then summarised per region/year.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(
    readfile("AM0110A2_3.csv"),
    by = c("region", "year", "sex")
  ) %>%
  group_by(region, year) %>%
  mutate(
    # Helper results may carry a "%" sign; strip it before as.numeric().
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Education data restricted to the 3-years-or-more post-secondary level,
# joined with the engineer table and drawn as one point per joined row.
readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = sum_edu, y = sum_ing, colour = region, size = year2)) +
  geom_point() +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Number of the population who are engineers"
  ) +
  theme(legend.position = "bottom")

Figure 12.15: The correlation between the number of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education, in the regions (NUTS2), Year 2003 - 2013.

# Engineer table: left_join() keeps every row of AM0110A4_3.csv and attaches
# the matching AM0110A2_3.csv row (same region, year and sex).
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(
    readfile("AM0110A2_3.csv"),
    by = c("region", "year", "sex")
  ) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Scatter plot of sum_ing against sum_edu with a regression line, its
# confidence band and the Pearson correlation coefficient.
readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "sum_edu",
    y = "sum_ing",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Number of the population who are engineers"
  )

Figure 12.16: The correlation between the number of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education, in the regions (NUTS2), Year 2003 - 2013.

# Engineer data joined on region, year and sex; salary.x comes from
# AM0110A4_3.csv (left table) and salary.y from AM0110A2_3.csv (right table).
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(
    readfile("AM0110A2_3.csv"),
    by = c("region", "year", "sex")
  ) %>%
  group_by(region, year) %>%
  mutate(
    # perc_women() / perc_sal() are defined outside this chunk.
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Education data with per-group totals (utbregno, sum_pop, sum_edu) and the
# share perc_edu, joined with the engineer table; only rows for the
# 3-years-or-more post-secondary level remain after the final filter.
tb <- readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# sum_edu regressed on year2 and on log(salary.y) interacting with sum_pop.
model <- lm(
  sum_edu ~ year2 + log(salary.y) * sum_pop,
  data = tb
)

# Coefficient table of the fitted model.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = 'Population with 3 years or more post-secondary education and number of population'
)

Table 12.6: Population with 3 years or more post-secondary education and number of population
term	estimate	std.error	statistic	p.value
(Intercept)	1.267390e+06	1.745676e+06	0.7260168	0.4689094
year2	1.147657e+02	9.712605e+02	0.1181616	0.9060907
log(salary.y)	-1.463272e+05	4.449966e+04	-3.2882775	0.0012441
sum_pop	-3.221561e+00	4.014156e-01	-8.0254995	0.0000000
log(salary.y):sum_pop	3.274354e-01	3.833540e-02	8.5413335	0.0000000

# Render the type 2 Anova table of `model` through broom's tidy() and kable().
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = 'Anova report from linear model fit'
)

Table 12.6: Anova report from linear model fit
term	sumsq	df	statistic	p.value
year2	8075342	1	0.0139622	0.9060907
log(salary.y)	15636315939	1	27.0349810	0.0000006
sum_pop	863975869744	1	1493.8027150	0.0000000
log(salary.y):sum_pop	42194876760	1	72.9543772	0.0000000
Residuals	90804635841	157	NA	NA

12.8 The correlation between the number of engineers and the proportion of engineers who are women in the regions, Year 2003 - 2013

# Engineer data per region, year and sex: AM0110A4_3.csv (left table,
# salary.x) joined with AM0110A2_3.csv (right table, salary.y).
tb <- readfile("AM0110A2_3.csv")
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(tb, by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    # perc_women() / perc_sal() are helpers defined outside this chunk; a "%"
    # in their result is removed before the conversion to numeric.
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Education data for the 3-years-or-more post-secondary level, joined with the
# engineer table and plotted as perc_women against sum_ing.
# The original chunk repeated the first group_by() verbatim; regrouping by the
# same variables has no effect, so the call appears only once here.
readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = perc_women, y = sum_ing, colour = region, size = year2)) +
  geom_point() +
  theme(legend.position = "bottom") +
  labs(
    x = "Per cent of engineers who are women",
    y = "Number of the population who are engineers"
  )

## Warning: Removed 12 rows containing missing values (geom_point).

Figure 12.17: The correlation between the number of engineers and the proportion of engineers who are women in the regions (NUTS2), Year 2003 - 2013.

# Engineer data per region, year and sex: AM0110A4_3.csv (left table,
# salary.x) joined with AM0110A2_3.csv (right table, salary.y).
tb <- readfile("AM0110A2_3.csv")
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(tb, by = c("region", "year", "sex")) %>%
  group_by(region, year) %>%
  mutate(
    # perc_women() / perc_sal() are helpers defined outside this chunk; a "%"
    # in their result is removed before the conversion to numeric.
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Scatter plot of sum_ing against perc_women with regression line, confidence
# band and Pearson correlation coefficient.
# The original chunk repeated the first group_by() verbatim; regrouping by the
# same variables has no effect, so the call appears only once here.
readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "perc_women",
    y = "sum_ing",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  labs(
    x = "Per cent of engineers who are women",
    y = "Number of the population who are engineers"
  )

## Warning: Removed 12 rows containing non-finite values (stat_smooth).

## Warning: Removed 12 rows containing non-finite values (stat_cor).

## Warning: Removed 12 rows containing missing values (geom_point).

Figure 12.18: The correlation between the number of engineers and the proportion of engineers who are women in the regions (NUTS2), Year 2003 - 2013.

# Engineer table: AM0110A4_3.csv rows with the matching AM0110A2_3.csv row
# attached (keys: region, year, sex), plus per region/year summaries.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(
    readfile("AM0110A2_3.csv"),
    by = c("region", "year", "sex")
  ) %>%
  group_by(region, year) %>%
  mutate(
    # The helpers perc_women() / perc_sal() live outside this chunk.
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Education table with group totals and shares, joined with the engineer
# table and reduced to the 3-years-or-more post-secondary level.
tb <- readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# sum_ing regressed on year2 plus the full three-way interaction of
# log(salary.y), sum_pop and perc_women.
model <- lm(
  sum_ing ~ year2 + log(salary.y) * sum_pop * perc_women,
  data = tb
)

# Coefficient table of the fitted model.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = 'Engineers and per cent of engineers who are women'
)

Table 12.7: Engineers and per cent of engineers who are women
term	estimate	std.error	statistic	p.value
(Intercept)	3.317429e+05	1.975462e+05	1.6793176	0.0953048
year2	-1.936866e+02	6.830468e+01	-2.8356265	0.0052469
log(salary.y)	5.175619e+03	1.587614e+04	0.3259997	0.7449079
sum_pop	1.704584e-01	1.735824e-01	0.9820029	0.3277804
perc_women	2.164406e+03	8.400848e+03	0.2576413	0.7970594
log(salary.y):sum_pop	-1.568240e-02	1.661170e-02	-0.9440625	0.3467526
log(salary.y):perc_women	-2.103752e+02	8.024072e+02	-0.2621801	0.7935652
sum_pop:perc_women	-9.808600e-03	7.946800e-03	-1.2342769	0.2191529
log(salary.y):sum_pop:perc_women	9.736000e-04	7.581000e-04	1.2842003	0.2011780

# Type 2 Anova for `model`, converted to a data frame by tidy() and printed
# as a booktabs table.
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = 'Anova report from linear model fit'
)

Table 12.7: Anova report from linear model fit
term	sumsq	df	statistic	p.value
year2	20082665	1	8.040778	0.0052469
log(salary.y)	24237236	1	9.704202	0.0022277
sum_pop	3210584661	1	1285.466737	0.0000000
perc_women	121504617	2	24.324252	0.0000000
log(salary.y):sum_pop	3146828	1	1.259939	0.2635703
log(salary.y):perc_women	8443149	1	3.380502	0.0680760
sum_pop:perc_women	28304085	1	11.332503	0.0009817
log(salary.y):sum_pop:perc_women	4118972	1	1.649170	0.2011780
Residuals	352161922	141	NA	NA

12.9 The correlation between the salary of engineers and the number of the population who have 3 years or more post-secondary education in the regions, Year 2003 - 2013

# Engineer data: AM0110A4_3.csv (left, salary.x) joined with AM0110A2_3.csv
# (right, salary.y) on region, year and sex.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(
    readfile("AM0110A2_3.csv"),
    by = c("region", "year", "sex")
  ) %>%
  group_by(region, year) %>%
  mutate(
    # A "%" in the helper output is dropped so the value parses as a number.
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# salary.y against sum_edu for the 3-years-or-more post-secondary level,
# coloured by region, sized by year2 and split into one panel per sex.
readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggplot(aes(x = sum_edu, y = salary.y, colour = region, size = year2)) +
  geom_point() +
  facet_grid(. ~ sex) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Salary of engineers"
  ) +
  theme(legend.position = "bottom")

Figure 12.19: The correlation between the salary of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education, in the regions (NUTS2), Year 2003 - 2013.

# Engineer table keyed by region, year and sex; salary.x stems from
# AM0110A4_3.csv and salary.y from AM0110A2_3.csv.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(
    readfile("AM0110A2_3.csv"),
    by = c("region", "year", "sex")
  ) %>%
  group_by(region, year) %>%
  mutate(
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Scatter plot of salary.y against sum_edu, one panel per sex, with a
# regression line, confidence band and Pearson correlation coefficient.
readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  ) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  ggscatter(
    x = "sum_edu",
    y = "salary.y",
    add = "reg.line",
    conf.int = TRUE,
    cor.coef = TRUE,
    cor.method = "pearson"
  ) +
  facet_grid(. ~ sex) +
  labs(
    x = "Number of the population with 3 years or more post-secondary education",
    y = "Salary of engineers"
  )

Figure 12.20: The correlation between the salary of engineers and the number of the population who have 3 years or more post-secondary education, but not postgraduate education, in the regions (NUTS2), Year 2003 - 2013.

# Engineer data joined on region, year and sex (salary.x from AM0110A4_3.csv,
# salary.y from AM0110A2_3.csv), with per region/year summaries added.
tb <- readfile("AM0110A4_3.csv") %>%
  left_join(
    readfile("AM0110A2_3.csv"),
    by = c("region", "year", "sex")
  ) %>%
  group_by(region, year) %>%
  mutate(
    # perc_women() / perc_sal() are defined outside this chunk.
    perc_women = as.numeric(sub("%", "", perc_women(salary.x))),
    perc_salary = as.numeric(sub("%", "", perc_sal(salary.y))),
    sum_ing = sum(salary.x)
  )

# Education data with totals and shares, joined with the engineer table and
# limited to the 3-years-or-more post-secondary level.
tb <- readfile("UF0506A1_1.csv") %>%
  group_by(`level of education`, region, year, sex) %>%
  mutate(utbregno = sum(salary)) %>%
  group_by(region, year, sex) %>%
  mutate(perc_edu = utbregno / sum(utbregno)) %>%
  group_by(region, year) %>%
  mutate(sum_pop = sum(utbregno)) %>%
  group_by(`level of education`, region, year) %>%
  mutate(sum_edu = sum(utbregno)) %>%
  right_join(tb, by = c("region", "year", "sex")) %>%
  mutate(perc_eng = sum_ing / sum_edu) %>%
  filter(
    `level of education` ==
      "post-secondary education 3 years or more (ISCED97 5A)"
  )

# Modelling frame for Cubist: grouping is dropped first so select() returns
# exactly the ten listed columns, then incomplete rows are removed.
# salary.y is deliberately listed last (column 10).
tb1 <- na.omit(
  select(
    ungroup(tb),
    utbregno, year2, perc_edu, sum_pop, sum_edu,
    perc_women, perc_salary, sum_ing, perc_eng, salary.y
  )
)

set.seed(1)

# Column 10 is salary.y, so tb1[, -10] holds the nine predictors.
cmodel <- cubist(tb1[, -10], tb1$salary.y)

summary(cmodel)

## 
## Call:
## cubist.default(x = tb1[, -10], y = tb1$salary.y)
## 
## 
## Cubist [Release 2.07 GPL Edition]  Sat Nov 09 22:47:35 2019
## ---------------------------------
## 
##     Target attribute `outcome'
## 
## Read 150 cases (10 attributes) from undefined.data
## 
## Model:
## 
##   Rule 1: [150 cases, mean 35084.0, range 26700 to 44500, est err 938.8]
## 
##  outcome = -1489479.7 + 0.1162 sum_edu - 0.0929 utbregno - 0.412 sum_ing
##            - 0.00608 sum_pop + 755 year2 - 24080 perc_edu
##            + 61927 perc_eng + 112 perc_women + 57 perc_salary
## 
## 
## Evaluation on training data (150 cases):
## 
##     Average  |error|              897.4
##     Relative |error|               0.28
##     Correlation coefficient        0.96
## 
## 
##  Attribute usage:
##    Conds  Model
## 
##           100%    utbregno
##           100%    year2
##           100%    perc_edu
##           100%    sum_pop
##           100%    sum_edu
##           100%    perc_women
##           100%    perc_salary
##           100%    sum_ing
##           100%    perc_eng
## 
## 
## Time: 0.0 secs

# log(salary.y) regressed on year2, perc_edu, sum_edu, sum_ing and perc_eng,
# fitted on `tb` as prepared earlier in the script.
model <- lm(
  log(salary.y) ~ year2 + perc_edu + sum_edu + sum_ing + perc_eng,
  data = tb
)

# Coefficient table of the fitted model.
knitr::kable(
  tidy(summary(model)),
  booktabs = TRUE,
  caption = 'Salary of engineers and population with 3 years or more post-secondary education'
)

Table 12.8: Salary of engineers and population with 3 years or more post-secondary education
term	estimate	std.error	statistic	p.value
(Intercept)	-45.0262481	2.6066108	-17.2738666	0.0000000
year2	0.0276718	0.0013012	21.2656449	0.0000000
perc_edu	-1.3416735	0.1256641	-10.6766661	0.0000000
sum_edu	0.0000012	0.0000003	4.5712949	0.0000098
sum_ing	-0.0000039	0.0000042	-0.9245089	0.3566493
perc_eng	-0.4289334	0.5025827	-0.8534583	0.3947138

# Type 2 Anova of the salary model, tidied and shown as a booktabs table.
knitr::kable(
  tidy(Anova(model, type = 2)),
  booktabs = TRUE,
  caption = 'Anova report from linear model fit'
)

Table 12.8: Anova report from linear model fit
term	sumsq	df	statistic	p.value
year2	1.0810807	1	452.2276514	0.0000000
perc_edu	0.2725036	1	113.9911980	0.0000000
sum_edu	0.0499551	1	20.8967375	0.0000098
sum_ing	0.0020433	1	0.8547167	0.3566493
perc_eng	0.0017413	1	0.7283910	0.3947138
Residuals	0.3729285	156	NA	NA