# 泛科學網站爬蟲
#徐晣彧-大數據分析-隨堂測驗
# Install packages ----
# Only install what is missing, so re-running the script does not
# re-download every package each time.
required_pkgs <- c(
  "httr", "xml2", "rvest", "stringr", "dplyr", "ggplot2", "purrr"
)
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_pkgs[!is_installed])
}
# Load packages ----
library(httr)
library(xml2)
library(rvest)
library(stringr)
library(dplyr)
library(ggplot2)
# Fetch the page ----
pan <- GET("http://pansci.asia/hots/month")
# Fail early with an informative error on a 4xx/5xx response instead of
# parsing an error page as if it were the ranking list.
stop_for_status(pan)
pan
content(pan, "text")
# Parse the HTML ----
pan_html <- read_html(pan)
pan_html
# Extract article titles and view counts ----
# Pull the text of every node matching the `.title` selector.
pan_title <- html_text(html_nodes(pan_html, ".title"))
# Elements 3..32 are kept as the 30 ranked articles; presumably the first two
# `.title` matches are not articles -- verify against the page markup.
if (length(pan_title) < 32) {
  stop(
    "Expected at least 32 '.title' nodes, found ", length(pan_title),
    "; the page layout may have changed.",
    call. = FALSE
  )
}
pan_title <- pan_title[3:32]
if (interactive()) {
  View(pan_title)
}
# View counts ----
pan_view <- html_text(html_nodes(pan_html, ".last"))
# Elements 5..34 are kept as the 30 view counts; presumably the first four
# `.last` matches are unrelated -- verify against the page markup.
if (length(pan_view) < 34) {
  stop(
    "Expected at least 34 '.last' nodes, found ", length(pan_view),
    "; the page layout may have changed.",
    call. = FALSE
  )
}
pan_view <- pan_view[5:34]
if (interactive()) {
  View(pan_view)
}
pan_df <- data.frame(
  pan_title = pan_title,
  # Drop thousands separators first: as.numeric("12,345") would yield NA.
  pan_view = as.numeric(gsub(",", "", pan_view, fixed = TRUE))
)
if (interactive()) {
  View(pan_df)
}
# Rank the 30 articles ----
# Rank 1 = most views. Negating the counts ranks in descending order while
# keeping each rank attached to its own row; the previous rev(rank(x)) only
# gave this result when the rows were already sorted by views.
pan_df$pan_rank <- rank(-pan_df$pan_view)
# Level "1" for articles above the view threshold, "2" otherwise.
view_threshold <- 33000
pan_df$pan_level <- ifelse(pan_df$pan_view > view_threshold, "1", "2")
# Number of characters in each title.
pan_df$pan_titlenum <- nchar(as.character(pan_df$pan_title))
if (interactive()) {
  View(pan_df)
}
# Word cloud ----
# Install wordcloud2 only when it is not already available.
if (!requireNamespace("wordcloud2", quietly = TRUE)) {
  install.packages("wordcloud2")
}
library(wordcloud2)
# wordcloud2() reads the word from the first column and its frequency from
# the second, so pass exactly those two columns: title and view count.
wordcloud2(
  pan_df[, c("pan_title", "pan_view")],
  size = 0.14,
  minSize = 1,
  gridSize = 2,
  color = "random-light",
  backgroundColor = "black"
)
print("05154152")
print("徐晣彧")
# 引用通告: W16 博群行銷演講分享與心得 – 徐晣彧的學習平台